Support multiple contraction kernel types in TensorContractionThreadPool

Eugene Zhulenev 2018-09-26 11:08:47 -07:00
parent 0a3356f4ec
commit 71cd3fbd6a
6 changed files with 484 additions and 20 deletions

cmake/FindMkldnn.cmake Normal file

@@ -0,0 +1,18 @@
# Intel mkl-dnn support.
# Link: https://github.com/intel/mkl-dnn
if (MKLDNN)
set(MKLDNN_FIND_QUIETLY TRUE)
set(MKLDNN_INCLUDES ${MKLDNN}/include)
set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
endif (MKLDNN)
find_path(MKLDNN
NAMES
mkldnn.h
PATHS
$ENV{MKLDNNDIR}/include
${INCLUDE_INSTALL_DIR}
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKLDNN DEFAULT_MSG
MKLDNN)
mark_as_advanced(MKLDNN)

unsupported/Eigen/CXX11/Tensor

@@ -75,6 +75,10 @@ typedef unsigned __int64 uint64_t;
#include "libxsmm.h"
#endif
#if defined(EIGEN_USE_MKLDNN)
#include "mkldnn.h"
#endif

#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
@@ -121,6 +125,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionMkldnn.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"

unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h Normal file

@@ -0,0 +1,116 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
#if defined(EIGEN_USE_MKLDNN)
// Support for MklDnn sgemm kernel in Tensor contractions:
//
// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
// DataMapper (see TensorContractionInputMapper).
// 2. Invoke gemm kernel with packed blocks (replacement for default
// gebp_kernel).
namespace Eigen {
namespace internal {
template <typename Scalar, typename StorageIndex, typename DataMapper,
int StorageOrder>
struct mkldnn_gemm_pack;
// mkldnn_gemm_pack for ColMajor storage order.
template <typename Scalar, typename StorageIndex, typename DataMapper>
struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
/*StorageOrder*/ ColMajor> {
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = internal::packet_traits<Scalar>::size };
EIGEN_DONT_INLINE
void operator()(Scalar *block, const DataMapper &data_mapper,
StorageIndex rows, StorageIndex cols) {
const StorageIndex unrolled_rows =
(rows / (4 * PacketSize)) * (4 * PacketSize);
const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
for (StorageIndex col = 0; col < cols; ++col) {
LinearMapper lm = data_mapper.getLinearMapper(0, col);
// Give the compiler a good opportunity to unroll the loop.
for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; ++j) {
const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
internal::pstoreu(block + j * PacketSize, p);
}
block += 4 * PacketSize;
}
// Process remaining rows with packets.
for (StorageIndex i = unrolled_rows; i < vectorized_rows;
i += PacketSize) {
const Packet p = lm.template loadPacket<Packet>(i);
internal::pstoreu(block, p);
block += PacketSize;
}
// Finalize with coefficients.
for (StorageIndex i = vectorized_rows; i < rows; ++i) {
*block = lm(i);
++block;
}
}
}
};
template <typename Scalar, typename StorageIndex, typename OutputMapper,
bool ConjugateLhs = false, bool ConjugateRhs = false>
struct mkldnn_gemm_kernel;
// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
bool ConjugateRhs>
struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
ConjugateLhs, ConjugateRhs> {
EIGEN_DONT_INLINE
void operator()(const OutputMapper &output, const float *blockA,
const float *blockB, const StorageIndex rows,
const StorageIndex depth, const StorageIndex cols,
float alpha) {
static const int max_index = (std::numeric_limits<int>::max)();
eigen_assert(max_index > rows);
eigen_assert(max_index > cols);
eigen_assert(max_index > depth);
eigen_assert(max_index > output.stride());
const int m = static_cast<int>(rows);
const int n = static_cast<int>(cols);
const int k = static_cast<int>(depth);
const char transposeA = ConjugateLhs ? 'Y' : 'N';
const char transposeB = ConjugateRhs ? 'Y' : 'N';
const int ldA = ConjugateLhs ? k : m;
const int ldB = ConjugateRhs ? n : k;
const int ldC = static_cast<int>(output.stride());
const float beta = 1.0;
mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
&alpha, blockA, &ldA, blockB, &ldB, &beta,
const_cast<float*>(output.data()), &ldC);
eigen_assert(st == 0);
}
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_USE_MKLDNN
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
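A minimal usage sketch (not part of the commit) of the two-step flow described in the header comment: pack column-major Lhs/Rhs blocks with mkldnn_gemm_pack, then multiply them with mkldnn_gemm_kernel, which forwards to mkldnn_sgemm. It mirrors what the unit test at the end of this commit does; the function name and buffer layout are illustrative, and it assumes EIGEN_USE_MKLDNN is defined and mkl-dnn is available. Since beta is hard-coded to 1, the kernel accumulates into the output, so the output buffer must be initialized first.

// Sketch only, not part of the commit. Requires the mkl-dnn headers/library and
// the unsupported/ Eigen directory on the include path.
#define EIGEN_USE_MKLDNN
#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Index;
using Eigen::internal::blas_data_mapper;
using Eigen::internal::mkldnn_gemm_kernel;
using Eigen::internal::mkldnn_gemm_pack;

// Computes out (m x n, column-major) += lhs (m x k) * rhs (k x n) for float buffers.
void sgemm_block_sketch(float* lhs, float* rhs, float* out, Index m, Index k, Index n) {
  typedef blas_data_mapper<float, Index, Eigen::ColMajor> DataMapper;
  typedef mkldnn_gemm_pack<float, Index, DataMapper, Eigen::ColMajor> GemmPack;
  typedef mkldnn_gemm_kernel<float, Index, DataMapper> GemmKernel;

  // Step 1: pack Lhs/Rhs blocks through a DataMapper (here a plain
  // blas_data_mapper; in the contraction it is an input-mapper SubMapper).
  std::vector<float> packed_lhs(m * k), packed_rhs(k * n);
  GemmPack pack;
  pack(packed_lhs.data(), DataMapper(lhs, m), /*rows=*/m, /*cols=*/k);
  pack(packed_rhs.data(), DataMapper(rhs, k), /*rows=*/k, /*cols=*/n);

  // Step 2: invoke the mkldnn_sgemm-backed kernel with the packed blocks.
  DataMapper output(out, m);  // used as the OutputMapper
  GemmKernel gemm;
  gemm(output, packed_lhs.data(), packed_rhs.data(), /*rows=*/m, /*depth=*/k,
       /*cols=*/n, /*alpha=*/1.0f);
}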

unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h

@@ -15,6 +15,177 @@
namespace Eigen {
namespace internal {
// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
// ColMajor storage order. This property is guaranteed by the
// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
// multiplication for these blocks. Default tensor contraction uses
// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
// GeneralBlockPanelKernel.h for details).
//
// By specializing contraction kernels we can use other low level libraries to
// perform matrix multiplication, and still rely on the Eigen thread pool
// evaluator for scaling. The assumption is that a custom gemm kernel does not
// use its own threading for parallelization.
//
// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
// multiplication, lhs tensor and rhs tensor respectively.
//
// - StorageIndex - index type for the tensor expressions. In practice it is
// almost always Eigen::Index.
//
// - OutputMapper provides access to the memory of the output matrix. In
// practice it's always a column major blas_data_mapper (it must be of
// ResScalar type).
//
// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
// dimensional view into the Lhs/Rhs tensor expressions. In practice it's
// TensorContractionInputMapper, or some specialization of it based on the
// type of the tensor expression (e.g. TensorImagePatchOp has an optimized
// input mapper).
//
// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
// evaluator.
template<typename ResScalar, typename LhsScalar, typename RhsScalar,
typename StorageIndex, typename OutputMapper, typename LhsMapper,
typename RhsMapper>
struct TensorContractionKernel {
typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
typename LhsMapper::SubMapper,
Traits::mr, Traits::LhsProgress,
typename Traits::LhsPacket4Packing, ColMajor>
LhsPacker;
typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
typename RhsMapper::SubMapper, Traits::nr,
ColMajor>
RhsPacker;
typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
OutputMapper, Traits::mr, Traits::nr,
/*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
GebpKernel;
EIGEN_DONT_INLINE
static void packLhs(LhsScalar* lhsBlock,
const typename LhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex rows) {
LhsPacker()(lhsBlock, data_mapper, depth, rows);
}
EIGEN_DONT_INLINE
static void packRhs(RhsScalar* rhsBlock,
const typename RhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex cols) {
RhsPacker()(rhsBlock, data_mapper, depth, cols);
}
EIGEN_DONT_INLINE
static void invoke(const OutputMapper& output_mapper,
const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
const StorageIndex rows, const StorageIndex depth,
const StorageIndex cols, const ResScalar alpha) {
GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
/*strideA*/ -1, /*strideB*/ -1,
/*offsetA*/ 0, /*offsetB*/ 0);
}
};
// Some tensor contraction kernels might rely on gemm libraries that are
// optimized for specific dimension sizes. By default Eigen picks block sizes
// that fit the working set in the L1/L2 caches; by specializing this class we
// can refine that choice and round the sizes up to work well with the
// underlying gemm library.
// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
template<typename ResScalar, typename LhsScalar, typename RhsScalar,
typename StorageIndex>
struct TensorContractionKernelBlocking {
static void refine(const StorageIndex /*m*/,
const StorageIndex /*n*/,
const StorageIndex /*k*/,
StorageIndex* /*bm*/,
StorageIndex* /*bn*/,
StorageIndex* /*bk*/) {
// By default we do nothing and stick to the block sizes picked by Eigen.
}
};
#if defined(EIGEN_USE_MKLDNN)
// If all scalar types in tensor contraction are floats, we can use mkldnn gemm
// as our low level kernel.
template<typename StorageIndex, typename OutputMapper, typename LhsMapper,
typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
LhsMapper, RhsMapper> {
// For now mkldnn has only mkldnn_sgemm (gemm for floats).
typedef float Scalar;
typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
typename LhsMapper::SubMapper, ColMajor>
LhsPacker;
typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
typename RhsMapper::SubMapper, ColMajor>
RhsPacker;
typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
GemmKernel;
EIGEN_DONT_INLINE
static void packLhs(Scalar* lhsBlock,
const typename LhsMapper::SubMapper& data_mapper,
StorageIndex depth, StorageIndex rows) {
LhsPacker()(lhsBlock, data_mapper, rows, depth);
}
EIGEN_DONT_INLINE
static void packRhs(Scalar* rhsBlock,
const typename RhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex cols) {
RhsPacker()(rhsBlock, data_mapper, depth, cols);
}
EIGEN_DONT_INLINE
static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
const Scalar* rhsBlock, const StorageIndex rows,
const StorageIndex depth, const StorageIndex cols,
const Scalar alpha) {
GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
}
};
// For mkldnn_sgemm, having the right dimensions (especially for small
// matrices) is more important than fitting the whole working set in the
// L1/L2 caches.
template<typename StorageIndex>
struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
// Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
static const StorageIndex kUnrollM = 48;
// Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the smallest
// number divisible by both of them.
static const StorageIndex kUnrollN = 24;
static void refine(const StorageIndex m,
const StorageIndex n,
const StorageIndex /*k*/,
StorageIndex* bm,
StorageIndex* bn,
StorageIndex* /*bk*/) {
// TODO(ezhulenev): There is probably a better way to pick block sizes.
*bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
*bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
// Stick with default bk.
}
};
#endif // EIGEN_USE_MKLDNN
} // namespace internal
template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
bn = blocking.nc();
bk = blocking.kc();
}
// Refine blocking choice to work well with contraction kernel.
internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
Index>::refine(m, n, k, &bm,
&bn, &bk);

// Number of kernels for each dimension.
Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
contract_t, internal::packet_traits<RhsScalar>::size,
rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
RhsMapper;
- typedef internal::gemm_pack_lhs<LhsScalar, Index,
- typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
- LhsPacker;
- typedef internal::gemm_pack_rhs<
- RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
- RhsPacker;
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
- Traits::mr, Traits::nr, false, false>
- GebpKernel;
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;

Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void pack_lhs(Index m, Index k) {
const Index mend = m * gm_ + gm(m);
for (Index m1 = m * gm_; m1 < mend; m1++)
- LhsPacker()(packed_lhs_[k % (P - 1)][m1],
- lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+ TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+ lhs_.getSubMapper(m1 * bm_, k * bk_),
+ bk(k), bm(m1));

if (!parallel_pack_ && shard_by_col_) {
signal_packing(k);
@@ -458,8 +629,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// deadlocks.
memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
}
- RhsPacker()(packed_rhs_[k % (P - 1)][n1],
- rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+ rhs_.getSubMapper(k * bk_, n1 * bn_),
+ bk(k), bn(n1));
}

if (parallel_pack_ || shard_by_col_) {
@@ -480,9 +652,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index n1 = n * gn_; n1 < nend; n1++) {
for (Index m1 = m * gm_; m1 < mend; m1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));

// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -495,9 +667,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index m1 = m * gm_; m1 < mend; m1++)
for (Index n1 = n * gn_; n1 < nend; n1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));

// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
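For reference (again, not part of the commit), the contract between the thread-pool evaluator and a TensorContractionKernel specialization is exactly the three static functions used above: pack_lhs calls packLhs per (m1, k) block, pack_rhs calls packRhs per (k, n1) block, and kernel() calls invoke per (m1, n1, k) triple, accumulating into the output. The hypothetical specialization below shows the shape of such a plug-in kernel for double (a type the mkldnn path does not cover), using plain loops and plain column-major packed blocks; it is an illustration of the interface, not code from Eigen or this commit.

// Sketch: a naive TensorContractionKernel specialization for double,
// illustrating the packLhs/packRhs/invoke contract. A real specialization
// would forward to an optimized gemm, as the mkldnn one does for float.
namespace Eigen {
namespace internal {

template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel<double, double, double, StorageIndex,
                               OutputMapper, LhsMapper, RhsMapper> {
  typedef double Scalar;

  // Pack the Lhs block as a plain column-major (rows x depth) matrix.
  static void packLhs(Scalar* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      StorageIndex depth, StorageIndex rows) {
    for (StorageIndex d = 0; d < depth; ++d)
      for (StorageIndex r = 0; r < rows; ++r) *lhsBlock++ = data_mapper(r, d);
  }

  // Pack the Rhs block as a plain column-major (depth x cols) matrix.
  static void packRhs(Scalar* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      StorageIndex depth, StorageIndex cols) {
    for (StorageIndex c = 0; c < cols; ++c)
      for (StorageIndex d = 0; d < depth; ++d) *rhsBlock++ = data_mapper(d, c);
  }

  // output += alpha * lhsBlock * rhsBlock, accumulating across the k blocks.
  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
                     const Scalar* rhsBlock, StorageIndex rows,
                     StorageIndex depth, StorageIndex cols, Scalar alpha) {
    for (StorageIndex j = 0; j < cols; ++j)
      for (StorageIndex i = 0; i < rows; ++i) {
        Scalar sum = 0;
        for (StorageIndex d = 0; d < depth; ++d)
          sum += lhsBlock[i + d * rows] * rhsBlock[d + j * depth];
        output_mapper(i, j) += alpha * sum;
      }
  }
};

}  // namespace internal
}  // namespace Eigen

As for blocking, the mkldnn TensorContractionKernelBlocking specialization above simply rounds the evaluator's block sizes up to the unroll factors: for example bm = 100 becomes min(m, divup(100, 48) * 48) = min(m, 144), while bk is left untouched.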

unsupported/test/CMakeLists.txt

@@ -23,6 +23,17 @@ else(XSMM_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
endif(XSMM_FOUND)
find_package(Mkldnn)
if(MKLDNN_FOUND)
add_definitions("-DEIGEN_USE_MKLDNN")
include_directories(${MKLDNN_INCLUDES})
link_directories(${MKLDNN_LIBRARIES})
set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
ei_add_property(EIGEN_TESTED_BACKENDS "Mkldnn, ")
else(MKLDNN_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Mkldnn, ")
endif(MKLDNN_FOUND)
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -190,6 +201,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_index_list)
ei_add_test(cxx11_tensor_mixed_indices)
ei_add_test(cxx11_tensor_contraction)
ei_add_test(cxx11_tensor_contraction_mkldnn)
ei_add_test(cxx11_tensor_convolution)
ei_add_test(cxx11_tensor_expr)
ei_add_test(cxx11_tensor_fixed_size)

unsupported/test/cxx11_tensor_contraction_mkldnn.cpp Normal file

@@ -0,0 +1,141 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
// Nothing to test here if we do not have mkldnn enabled.
#if defined(EIGEN_USE_MKLDNN)
#include <Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::ColMajor;
using Eigen::Tensor;
using Eigen::Index;
using Eigen::internal::blas_data_mapper;
using Eigen::internal::mkldnn_gemm_kernel;
using Eigen::internal::mkldnn_gemm_pack;
template <int NumDims>
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
array<Index, NumDims> dims;
for (int i = 0; i < NumDims; ++i) {
dims[i] = internal::random<int>(min_dim, max_dim);
}
return dims;
}
// Packing with mkldnn_gemm_pack is the same as taking a slice of a
// 2-dimensional Tensor.
template <typename Scalar>
static void test_mkldnn_gemm_pack() {
static const int Options = 0 | ColMajor;
typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
array<Index, 2> dims = RandomDims<2>(1, 500);
// Create a tensor initialized with random data.
Tensor2d src(dims);
src.setRandom();
// Pick a random slice of src tensor.
array<Index, 2> slice_start = RandomDims<2>(0, 250);
array<Index, 2> slice_size = RandomDims<2>(100, 500);
// Make sure that slice start + size do not overflow tensor dims.
for (int i = 0; i < 2; ++i) {
slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
}
// Prepare tensors for packing and slicing results.
Tensor2d pack_dst(slice_size[0], slice_size[1]);
Tensor2d slice_dst(slice_size[0], slice_size[1]);
// Pack memory using mkldnn_gemm_pack.
DataMapper data_mapper(src.data(), dims[0]);
MkldnnGemmPack gemm_pack;
gemm_pack(pack_dst.data(),
data_mapper.getSubMapper(slice_start[0], slice_start[1]),
slice_size[0], slice_size[1]);
// Slice the source tensor.
slice_dst = src.slice(slice_start, slice_size);
// Verify that dst tensors are equal.
VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
slice_dst.dimensions().TotalSize());
for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
Scalar packed = pack_dst.coeff(i);
Scalar sliced = slice_dst.coeff(i);
VERIFY_IS_EQUAL(packed, sliced);
}
}
template <typename Scalar>
static void test_mkldnn_gemm_kernel() {
static const int Options = 0 | ColMajor;
typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
int m = internal::random<int>(1, 100);
int n = internal::random<int>(1, 100);
int k = internal::random<int>(1, 100);
Tensor2d lhs(m, k);
lhs.setRandom();
Tensor2d rhs(k, n);
rhs.setRandom();
// Compute matmul with mkldnn gemm kernel.
typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>
MkldnnGemmKernel;
Tensor2d mkldnn_result(m, n);
mkldnn_result.setZero();
OutputMapper output_mapper(mkldnn_result.data(), m);
MkldnnGemmKernel gemm_kernel;
gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
// Compute matmul with Eigen::Matrix.
typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
typedef Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> > MatrixMap;
MatrixMap lhs_mat(lhs.data(), m, k);
MatrixMap rhs_mat(rhs.data(), k, n);
Matrix matmul_result(m, n);
matmul_result.setZero();
matmul_result = lhs_mat * rhs_mat;
static const float error_threshold = 1e-4f;
// Verify that results are equal.
for (Index i = 0; i < m * n; ++i) {
Scalar gemm = mkldnn_result(i);
Scalar matmul = matmul_result(i % m, i / m);
if ((std::abs)(gemm) > error_threshold &&
(std::abs)(matmul) > error_threshold) {
if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
}
}
}
EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
}
#else
EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {}
#endif // EIGEN_USE_MKLDNN