diff --git a/cmake/FindMkldnn.cmake b/cmake/FindMkldnn.cmake
new file mode 100644
index 000000000..6acba9d8c
--- /dev/null
+++ b/cmake/FindMkldnn.cmake
@@ -0,0 +1,18 @@
+# Intel mkl-dnn support.
+# Link: https://github.com/intel/mkl-dnn
+if (MKLDNN)
+  set(MKLDNN_FIND_QUIETLY TRUE)
+  set(MKLDNN_INCLUDES ${MKLDNN}/include)
+  set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
+endif (MKLDNN)
+find_path(MKLDNN
+  NAMES
+  mkldnn.h
+  PATHS
+  $ENV{MKLDNNDIR}/include
+  ${INCLUDE_INSTALL_DIR}
+  )
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MKLDNN DEFAULT_MSG
+                                  MKLDNN)
+mark_as_advanced(MKLDNN)
\ No newline at end of file
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 14a2d0f0d..420afe650 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -75,6 +75,10 @@ typedef unsigned __int64 uint64_t;
 #include "libxsmm.h"
 #endif
 
+#if defined(EIGEN_USE_MKLDNN)
+#include "mkldnn.h"
+#endif
+
 #ifdef EIGEN_USE_THREADS
 #include "ThreadPool"
 #endif
@@ -121,6 +125,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionMkldnn.h"
 #include "src/Tensor/TensorContractionBlocking.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
new file mode 100644
index 000000000..a97f043c1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+
+#if defined(EIGEN_USE_MKLDNN)
+// Support for MklDnn sgemm kernel in Tensor contractions:
+//
+// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
+//    DataMapper (see TensorContractionInputMapper).
+// 2. Invoke gemm kernel with packed blocks (replacement for the default
+//    gebp_kernel).
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, typename StorageIndex, typename DataMapper,
+          int StorageOrder>
+struct mkldnn_gemm_pack;
+
+// mkldnn_gemm_pack for ColMajor storage order.
+template <typename Scalar, typename StorageIndex, typename DataMapper>
+struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
+                        /*StorageOrder*/ ColMajor> {
+  typedef typename internal::packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  enum { PacketSize = internal::packet_traits<Scalar>::size };
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar *block, const DataMapper &data_mapper,
+                  StorageIndex rows, StorageIndex cols) {
+    const StorageIndex unrolled_rows =
+        (rows / (4 * PacketSize)) * (4 * PacketSize);
+    const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      LinearMapper lm = data_mapper.getLinearMapper(0, col);
+
+      // Give the compiler a strong hint to unroll the loop.
+      for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; ++j) {
+          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+          internal::pstoreu(block + j * PacketSize, p);
+        }
+        block += 4 * PacketSize;
+      }
+
+      // Process remaining rows with packets.
+      for (StorageIndex i = unrolled_rows; i < vectorized_rows;
+           i += PacketSize) {
+        const Packet p = lm.template loadPacket<Packet>(i);
+        internal::pstoreu(block, p);
+        block += PacketSize;
+      }
+
+      // Finalize with coefficients.
+      for (StorageIndex i = vectorized_rows; i < rows; ++i) {
+        *block = lm(i);
+        ++block;
+      }
+    }
+  }
+};
+
+template <typename Scalar, typename StorageIndex, typename OutputMapper,
+          bool ConjugateLhs = false, bool ConjugateRhs = false>
+struct mkldnn_gemm_kernel;
+
+// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
+template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
+          bool ConjugateRhs>
+struct mkldnn_gemm_kernel<float, StorageIndex, OutputMapper, ConjugateLhs,
+                          ConjugateRhs> {
+  EIGEN_DONT_INLINE
+  void operator()(const OutputMapper &output, const float *blockA,
+                  const float *blockB, const StorageIndex rows,
+                  const StorageIndex depth, const StorageIndex cols,
+                  float alpha) {
+    static const int max_index = (std::numeric_limits<int>::max)();
+
+    eigen_assert(max_index > rows);
+    eigen_assert(max_index > cols);
+    eigen_assert(max_index > depth);
+    eigen_assert(max_index > output.stride());
+
+    const int m = static_cast<int>(rows);
+    const int n = static_cast<int>(cols);
+    const int k = static_cast<int>(depth);
+
+    const char transposeA = ConjugateLhs ? 'Y' : 'N';
+    const char transposeB = ConjugateRhs ? 'Y' : 'N';
+
+    const int ldA = ConjugateLhs ? k : m;
+    const int ldB = ConjugateRhs ? n : k;
+    const int ldC = static_cast<int>(output.stride());
+
+    const float beta = 1.0;
+
+    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
+                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
+                                      const_cast<float *>(output.data()), &ldC);
+    eigen_assert(st == 0);
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_USE_MKLDNN
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0980854b4..0c4d2f0bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,6 +15,177 @@
 
 namespace Eigen {
 
+namespace internal {
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. The default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen thread pool
+// evaluator for scaling. The assumption is that a custom gemm does not use its
+// own threading for parallelization.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar types for the result of the
+//   multiplication, the lhs tensor and the rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+//   almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it is always a column major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
+//   dimensional view into the Lhs/Rhs tensor expressions. In practice it is a
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of the tensor expression (e.g. TensorImagePatchOp has an optimized
+//   input mapper).
+//
+// TODO(ezhulenev): Use TensorContractionKernel in the default tensor
+// contraction evaluator.
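+//
+// A rough usage sketch of how the thread pool evaluator below is expected to
+// drive this interface (illustrative only; lhs/rhs/output are the mappers and
+// bm/bn/bk the block sizes picked by the evaluator):
+//
+//   typedef TensorContractionKernel<ResScalar, LhsScalar, RhsScalar,
+//                                   StorageIndex, OutputMapper, LhsMapper,
+//                                   RhsMapper> Kernel;
+//   Kernel::packLhs(lhs_block, lhs.getSubMapper(i * bm, k * bk), bk, bm);
+//   Kernel::packRhs(rhs_block, rhs.getSubMapper(k * bk, j * bn), bk, bn);
+//   Kernel::invoke(output.getSubMapper(i * bm, j * bn), lhs_block, rhs_block,
+//                  bm, bk, bn, /*alpha=*/ResScalar(1));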
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper, Traits::mr,
+                                  Traits::LhsProgress, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
+// Some tensor contraction kernels might rely on gemm libraries that are
+// optimized for specific dimension sizes. By default Eigen picks block sizes
+// that fit the working set in the L1/L2 caches; by specializing we can refine
+// this choice and round these sizes up to work well with the underlying gemm
+// library.
+// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep it separate?
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex>
+struct TensorContractionKernelBlocking {
+  static void refine(const StorageIndex /*m*/,
+                     const StorageIndex /*n*/,
+                     const StorageIndex /*k*/,
+                     StorageIndex* /*bm*/,
+                     StorageIndex* /*bn*/,
+                     StorageIndex* /*bk*/) {
+    // By default we do nothing and stick to the block sizes picked by Eigen.
+  }
+};
+
+#if defined(EIGEN_USE_MKLDNN)
+// If all scalar types in the tensor contraction are floats, we can use mkldnn
+// gemm as our low level kernel.
+template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
+                               LhsMapper, RhsMapper> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  typedef float Scalar;
+
+  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename LhsMapper::SubMapper, ColMajor>
+      LhsPacker;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename RhsMapper::SubMapper, ColMajor>
+      RhsPacker;
+
+  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
+      GemmKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(Scalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      StorageIndex depth, StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(Scalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
+                     const Scalar* rhsBlock, const StorageIndex rows,
+                     const StorageIndex depth, const StorageIndex cols,
+                     const Scalar alpha) {
+    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+  }
+};
+
+// For mkldnn_sgemm having the right dimensions (especially for small matrices)
+// is more important than fitting all of the working set in the L1/L2 caches.
+template <typename StorageIndex>
+struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
+  static const StorageIndex kUnrollM = 48;
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the smallest
+  // number that is divisible by both of them.
+  static const StorageIndex kUnrollN = 24;
+
+  static void refine(const StorageIndex m,
+                     const StorageIndex n,
+                     const StorageIndex /*k*/,
+                     StorageIndex* bm,
+                     StorageIndex* bn,
+                     StorageIndex* /*bk*/) {
+    // TODO(ezhulenev): There is probably a better way to pick block sizes.
+    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
+    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
+    // Stick with the default bk.
+  }
+};
+
+#endif  // EIGEN_USE_MKLDNN
+}  // namespace internal
+
 template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@
+    internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
+                                              Index>::refine(m, n, k, &bm,
+                                                             &bn, &bk);
 
     // Number of kernels for each dimension.
     Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@
         contract_t, internal::packet_traits<RhsScalar>::size,
         rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
         RhsMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
+
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
             Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@
diff --git a/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp b/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
new file mode 100644
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+// Nothing to test here if we do not have mkldnn enabled.
+#if defined(EIGEN_USE_MKLDNN)
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::ColMajor;
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::internal::blas_data_mapper;
+using Eigen::internal::mkldnn_gemm_kernel;
+using Eigen::internal::mkldnn_gemm_pack;
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(min_dim, max_dim);
+  }
+  return dims;
+}
+
+// Packing with mkldnn_gemm_pack is the same as taking a slice of a two
+// dimensional Tensor.
+template <typename Scalar>
+static void test_mkldnn_gemm_pack() {
+  static const int Options = 0 | ColMajor;
+
+  typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
+  typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
+  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
+
+  array<Index, 2> dims = RandomDims<2>(1, 500);
+
+  // Create a tensor initialized with random data.
+  Tensor2d src(dims);
+  src.setRandom();
+
+  // Pick a random slice of the src tensor.
+  array<Index, 2> slice_start = RandomDims<2>(0, 250);
+  array<Index, 2> slice_size = RandomDims<2>(100, 500);
+  // Make sure that slice start + size does not overflow tensor dims.
+  for (int i = 0; i < 2; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  // Prepare tensors for packing and slicing results.
+  Tensor2d pack_dst(slice_size[0], slice_size[1]);
+  Tensor2d slice_dst(slice_size[0], slice_size[1]);
+
+  // Pack memory using mkldnn_gemm_pack.
+  DataMapper data_mapper(src.data(), dims[0]);
+  MkldnnGemmPack gemm_pack;
+  gemm_pack(pack_dst.data(),
+            data_mapper.getSubMapper(slice_start[0], slice_start[1]),
+            slice_size[0], slice_size[1]);
+
+  // Slice the source tensor.
+  slice_dst = src.slice(slice_start, slice_size);
+
+  // Verify that the dst tensors are equal.
+  VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
+                  slice_dst.dimensions().TotalSize());
+  for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
+    Scalar packed = pack_dst.coeff(i);
+    Scalar sliced = slice_dst.coeff(i);
+    VERIFY_IS_EQUAL(packed, sliced);
+  }
+}
+
+template <typename Scalar>
+static void test_mkldnn_gemm_kernel() {
+  static const int Options = 0 | ColMajor;
+
+  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
+
+  int m = internal::random<int>(1, 100);
+  int n = internal::random<int>(1, 100);
+  int k = internal::random<int>(1, 100);
+
+  Tensor2d lhs(m, k);
+  lhs.setRandom();
+
+  Tensor2d rhs(k, n);
+  rhs.setRandom();
+
+  // Compute matmul with the mkldnn gemm kernel.
+  typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+  typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper> MkldnnGemmKernel;
+
+  Tensor2d mkldnn_result(m, n);
+  mkldnn_result.setZero();
+
+  OutputMapper output_mapper(mkldnn_result.data(), m);
+  MkldnnGemmKernel gemm_kernel;
+  gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
+
+  // Compute matmul with Eigen::Matrix.
+  typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
+  typedef Map<Matrix> MatrixMap;
+
+  MatrixMap lhs_mat(lhs.data(), m, k);
+  MatrixMap rhs_mat(rhs.data(), k, n);
+  Matrix matmul_result(m, n);
+  matmul_result.setZero();
+
+  matmul_result = lhs_mat * rhs_mat;
+
+  static const float error_threshold = 1e-4f;
+
+  // Verify that the results are approximately equal.
+  for (Index i = 0; i < m * n; ++i) {
+    Scalar gemm = mkldnn_result(i);
+    Scalar matmul = matmul_result(i % m, i / m);
+    if ((std::abs)(gemm) > error_threshold &&
+        (std::abs)(matmul) > error_threshold) {
+      if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
+        std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
+      VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
+  CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
+  CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
+}
+#else
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {}
+#endif  // EIGEN_USE_MKLDNN
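A minimal usage sketch, assuming the patch is applied and mkl-dnn is available on the include and link paths (e.g. via the MKLDNN variable consumed by FindMkldnn.cmake above): both EIGEN_USE_THREADS and EIGEN_USE_MKLDNN must be defined, and the contraction must be evaluated on a ThreadPoolDevice, since only the thread pool evaluator dispatches through TensorContractionKernel; with all-float scalars it then selects the mkldnn_sgemm based specialization. The file name, compile flags, thread count and tensor sizes below are placeholders, not part of the patch.

// contraction_example.cpp -- placeholder name; compile with something like:
//   g++ -O2 -DEIGEN_USE_THREADS -DEIGEN_USE_MKLDNN \
//       -I<eigen-root> -I<mkldnn>/include -L<mkldnn>/lib -lmkldnn \
//       contraction_example.cpp
#define EIGEN_USE_THREADS
#define EIGEN_USE_MKLDNN
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Thread pool that the contraction evaluator uses to shard block work.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> lhs(256, 128), rhs(128, 64), out(256, 64);
  lhs.setRandom();
  rhs.setRandom();

  // Contract the inner dimensions: out = lhs * rhs. With float scalars the
  // thread pool evaluator picks the mkldnn-based TensorContractionKernel.
  Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> dims = {
      {Eigen::IndexPair<Eigen::Index>(1, 0)}};
  out.device(device) = lhs.contract(rhs, dims);
  return 0;
}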