Remove explicit mkldnn support and redundant TensorContractionKernelBlocking
parent b314376f9c
commit 9f4988959f
@ -1,18 +0,0 @@
-# Intel mkl-dnn support.
-# Link: https://github.com/intel/mkl-dnn
-if (MKLDNN)
-  set(MKLDNN_FIND_QUIETLY TRUE)
-  set(MKLDNN_INCLUDES ${MKLDNN}/include)
-  set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
-endif (MKLDNN)
-find_path(MKLDNN
-  NAMES
-  mkldnn.h
-  PATHS
-  $ENV{MKLDNNDIR}/include
-  ${INCLUDE_INSTALL_DIR}
-)
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(MKLDNN DEFAULT_MSG
-                                  MKLDNN)
-mark_as_advanced(MKLDNN)
@ -75,10 +75,6 @@ typedef unsigned __int64 uint64_t;
 #include "libxsmm.h"
 #endif
 
-#if defined(EIGEN_USE_MKLDNN)
-#include "mkldnn.h"
-#endif
-
 #ifdef EIGEN_USE_THREADS
 #include "ThreadPool"
 #endif
@ -125,7 +121,6 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
-#include "src/Tensor/TensorContractionMkldnn.h"
 #include "src/Tensor/TensorContractionBlocking.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
@ -136,6 +136,81 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
 static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlocPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on Eigen contraction evaluator.
+// This also includes full support in TensorContractionThreadPool, assuming that
+// underlying gemm do not use it's own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of ResScalar
+//   type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
+//   view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template<typename ResScalar, typename LhsScalar, typename RhsScalar,
+         typename StorageIndex, typename OutputMapper, typename LhsMapper,
+         typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper,
+                                  Traits::mr, Traits::LhsProgress,
+                                  typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
 } // end namespace internal
 
 // Tensor contraction params that should enable to get from output matrix
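The comment above describes a three-call protocol (packLhs, packRhs, invoke) that the contraction evaluator drives block by block. As a rough standalone sketch of that protocol, with ToyContractionKernel, SubMapper and the block layouts all invented here for illustration (they are not Eigen types), a scalar-only kernel of the same shape could look like:

    using Index = long;

    // Toy column-major view into a matrix, standing in for LhsMapper/RhsMapper.
    struct SubMapper {
      const float* data;  // base pointer of the full matrix
      Index ld;           // leading dimension (number of rows allocated)
      float operator()(Index i, Index j) const { return data[j * ld + i]; }
    };

    struct ToyContractionKernel {
      // Pack a rows x depth block of Lhs into contiguous storage, one row at a time.
      static void packLhs(float* block, const SubMapper& lhs, Index depth, Index rows) {
        for (Index r = 0; r < rows; ++r)
          for (Index d = 0; d < depth; ++d) *block++ = lhs(r, d);
      }
      // Pack a depth x cols block of Rhs into contiguous storage, one column at a time.
      static void packRhs(float* block, const SubMapper& rhs, Index depth, Index cols) {
        for (Index c = 0; c < cols; ++c)
          for (Index d = 0; d < depth; ++d) *block++ = rhs(d, c);
      }
      // out (rows x cols, column-major, leading dimension out_ld) +=
      //   alpha * LhsBlock (rows x depth) * RhsBlock (depth x cols).
      static void invoke(float* out, Index out_ld, const float* lhsBlock,
                         const float* rhsBlock, Index rows, Index depth, Index cols,
                         float alpha) {
        for (Index c = 0; c < cols; ++c)
          for (Index r = 0; r < rows; ++r) {
            float acc = 0;
            for (Index d = 0; d < depth; ++d)
              acc += lhsBlock[r * depth + d] * rhsBlock[c * depth + d];
            out[c * out_ld + r] += alpha * acc;
          }
      }
    };

The real kernel writes through an OutputMapper and relies on gebp_kernel's packed block formats; only the call structure that the evaluator drives blockwise is mirrored here.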
@ -591,13 +666,9 @@ struct TensorContractionEvaluatorBase
     // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
     this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
 
-    // define mr, nr, and all of my data mapper types
+    // define data mappers for Lhs and Rhs
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@ -619,11 +690,9 @@ struct TensorContractionEvaluatorBase
 
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
-
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@ -635,7 +704,7 @@ struct TensorContractionEvaluatorBase
     OutputMapper output(buffer, m);
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
     const Index kc = blocking.kc();
     const Index mc = numext::mini(m, blocking.mc());
     const Index nc = numext::mini(n, blocking.nc());
@ -651,19 +720,22 @@ struct TensorContractionEvaluatorBase
     for (Index k2 = 0; k2 < k; k2 += kc) {
       // make sure we don't overshoot right edge of left matrix, then pack vertical panel
       const Index actual_kc = numext::mini(k2 + kc, k) - k2;
-      pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+      TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
+                                       actual_kc, actual_mc);
 
       // series of horizontal blocks
       for (Index j2 = 0; j2 < n; j2 += nc) {
         // make sure we don't overshoot right edge of right matrix, then pack block
         const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-        pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+        TensorContractionKernel::packRhs(blockB, rhs.getSubMapper(k2, j2),
+                                         actual_kc, actual_nc);
 
         // call gebp (matrix kernel)
         // The parameters here are copied from Eigen's GEMM implementation
         const OutputMapper output_mapper = output.getSubMapper(i2, j2);
-        gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
-             Scalar(1), -1, -1, 0, 0);
+        TensorContractionKernel::invoke(output_mapper, blockA, blockB,
+                                        actual_mc, actual_kc, actual_nc,
+                                        Scalar(1));
 
         // We are done with this [i2, j2] output block.
         if (k2 + kc >= k) {
@ -21,7 +21,7 @@ enum {
 
 
 // Default Blocking Strategy
-template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
 class TensorContractionBlocking {
  public:
 
@ -42,7 +42,7 @@ class TensorContractionBlocking {
 #if !defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC
 #endif
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@ -53,23 +53,23 @@ class TensorContractionBlocking {
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
 
  private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
 };
 
 
 
 #if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
+template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
 class TensorXsmmContractionBlocking {
  public:
-  TensorXsmmContractionBlocking(Index k, Index m, Index n,
+  TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
                                 size_t max_num_threads = 1, bool transposeA = false,
                                 bool transposeB = false):
       k_(k), m_(m), n_(n), transposeA_(transposeA),
@ -164,28 +164,28 @@ class TensorXsmmContractionBlocking {
     eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
   }
 
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
-  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
-  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
   EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
   EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
   EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
   EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
   EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
-  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
   EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
     return prefetch_;
   }
 
  private:
-  Index k_, m_, n_;
-  Index kc_, mc_, nc_;
-  Index outer_k_, outer_m_, outer_n_;
+  StorageIndex k_, m_, n_;
+  StorageIndex kc_, mc_, nc_;
+  StorageIndex outer_k_, outer_m_, outer_n_;
   bool copyA_, copyB_, transposeA_, transposeB_;
   size_t num_threads_;
 
@ -1,116 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
-
-#if defined(EIGEN_USE_MKLDNN)
-// Support for MklDnn sgemm kernel in Tensor contractions:
-//
-// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
-//    DataMapper (see TensorContractionInputMapper).
-// 2. Invoke gemm kernel with packed blocks (replacement for default
-//    gebp_kernel).
-
-namespace Eigen {
-namespace internal {
-
-template <typename Scalar, typename StorageIndex, typename DataMapper,
-          int StorageOrder>
-struct mkldnn_gemm_pack;
-
-// mkl_gemm_pack for ColMajor storage order.
-template <typename Scalar, typename StorageIndex, typename DataMapper>
-struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
-                        /*StorageOrder*/ ColMajor> {
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  typedef typename DataMapper::LinearMapper LinearMapper;
-
-  enum { PacketSize = internal::packet_traits<Scalar>::size };
-
-  EIGEN_DONT_INLINE
-  void operator()(Scalar *block, const DataMapper &data_mapper,
-                  StorageIndex rows, StorageIndex cols) {
-    const StorageIndex unrolled_rows =
-        (rows / (4 * PacketSize)) * (4 * PacketSize);
-    const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
-
-    for (StorageIndex col = 0; col < cols; ++col) {
-      LinearMapper lm = data_mapper.getLinearMapper(0, col);
-
-      // Give compiler a strong possibility to unroll the loop.
-      for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
-        for (StorageIndex j = 0; j < 4; ++j) {
-          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
-          internal::pstoreu(block + j * PacketSize, p);
-        }
-        block += 4 * PacketSize;
-      }
-
-      // Process remaining rows with packets.
-      for (StorageIndex i = unrolled_rows; i < vectorized_rows;
-           i += PacketSize) {
-        const Packet p = lm.template loadPacket<Packet>(i);
-        internal::pstoreu(block, p);
-        block += PacketSize;
-      }
-
-      // Finalize with coefficients.
-      for (StorageIndex i = vectorized_rows; i < rows; ++i) {
-        *block = lm(i);
-        ++block;
-      }
-    }
-  }
-};
-
-template <typename Scalar, typename StorageIndex, typename OutputMapper,
-          bool ConjugateLhs = false, bool ConjugateRhs = false>
-struct mkldnn_gemm_kernel;
-
-// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
-template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
-          bool ConjugateRhs>
-struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
-                          ConjugateLhs, ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const OutputMapper &output, const float *blockA,
-                  const float *blockB, const StorageIndex rows,
-                  const StorageIndex depth, const StorageIndex cols,
-                  float alpha) {
-    static const int max_index = (std::numeric_limits<int>::max)();
-
-    eigen_assert(max_index > rows);
-    eigen_assert(max_index > cols);
-    eigen_assert(max_index > depth);
-    eigen_assert(max_index > output.stride());
-
-    const int m = static_cast<int>(rows);
-    const int n = static_cast<int>(cols);
-    const int k = static_cast<int>(depth);
-
-    const char transposeA = ConjugateLhs ? 'Y' : 'N';
-    const char transposeB = ConjugateRhs ? 'Y' : 'N';
-
-    const int ldA = ConjugateLhs ? k : m;
-    const int ldB = ConjugateRhs ? n : k;
-    const int ldC = static_cast<int>(output.stride());
-
-    const float beta = 1.0;
-
-    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
-                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
-                                      const_cast<float*>(output.data()), &ldC);
-    eigen_assert(st == 0);
-  }
-};
-
-} // namespace internal
-} // namespace Eigen
-#endif // EIGEN_USE_MKLDNN
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
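The ColMajor pack above walks each column in three passes: unrolled groups of four packets, single packets, and a scalar tail. A tiny standalone check of that bounds arithmetic, assuming PacketSize = 4 purely for illustration (the real value depends on Scalar and the SIMD width):

    #include <cassert>

    int main() {
      const int PacketSize = 4;  // illustrative value only
      const int rows = 23;
      const int unrolled_rows = (rows / (4 * PacketSize)) * (4 * PacketSize);
      const int vectorized_rows = (rows / PacketSize) * PacketSize;
      assert(unrolled_rows == 16);          // rows [0, 16): unrolled 4-packet chunks
      assert(vectorized_rows == 20);        // rows [16, 20): one packet at a time
      assert(rows - vectorized_rows == 3);  // rows [20, 23): scalar tail
      return 0;
    }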
@ -15,177 +15,6 @@
 
 namespace Eigen {
 
-namespace internal {
-
-// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
-// ColMajor storage order. This property is guaranteed by the
-// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
-// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
-// multiplication for these blocks. Default tensor contraction uses
-// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
-// GeneralBlocPanelKernel.h for details).
-//
-// By specializing contraction kernels we can use other low level libraries to
-// perform matrix multiplication, and still rely on Eigen thread pool evaluator
-// for scaling. Assumption is that custom gemm do not use it's own threading for
-// parallelisation.
-//
-// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
-//   multiplication, lhs tensor and rhs tensor respectively.
-//
-// - StorageIndex - index type for the tensor expressions. In practice almost
-//   always is Eigen::Index.
-//
-// - OutputMapper provides access to the memory of the output matrix. In
-//   practice it's always column major blas_data_mapper (it must be of ResScalar
-//   type).
-//
-// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
-//   view into the Lhs/Rhs tensor expressions. In practice it's
-//   TensorContractionInputMapper, or some specialization of it based on the
-//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
-//   mapper).
-//
-// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
-// evaluator.
-template<typename ResScalar, typename LhsScalar, typename RhsScalar,
-         typename StorageIndex, typename OutputMapper, typename LhsMapper,
-         typename RhsMapper>
-struct TensorContractionKernel {
-  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
-                                  typename LhsMapper::SubMapper,
-                                  Traits::mr, Traits::LhsProgress,
-                                  typename Traits::LhsPacket4Packing, ColMajor>
-      LhsPacker;
-
-  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
-                                  typename RhsMapper::SubMapper, Traits::nr,
-                                  ColMajor>
-      RhsPacker;
-
-  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
-                                OutputMapper, Traits::mr, Traits::nr,
-                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
-      GebpKernel;
-
-  EIGEN_DONT_INLINE
-  static void packLhs(LhsScalar* lhsBlock,
-                      const typename LhsMapper::SubMapper& data_mapper,
-                      const StorageIndex depth, const StorageIndex rows) {
-    LhsPacker()(lhsBlock, data_mapper, depth, rows);
-  }
-
-  EIGEN_DONT_INLINE
-  static void packRhs(RhsScalar* rhsBlock,
-                      const typename RhsMapper::SubMapper& data_mapper,
-                      const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(rhsBlock, data_mapper, depth, cols);
-  }
-
-  EIGEN_DONT_INLINE
-  static void invoke(const OutputMapper& output_mapper,
-                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
-                     const StorageIndex rows, const StorageIndex depth,
-                     const StorageIndex cols, const ResScalar alpha) {
-    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
-                 /*strideA*/ -1, /*strideB*/ -1,
-                 /*offsetA*/ 0, /*offsetB*/ 0);
-  }
-};
-
-// Some tensor contraction kernels might rely on the gemm libraries that are
-// optimized for a specific dimension sizes. By default Eigen picks block
-// sizes to fit the working set in the L1/L2 caches, by specializing we can
-// refine this choice and round up these sizes to work well with underlying gemm
-// library.
-// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
-template<typename ResScalar, typename LhsScalar, typename RhsScalar,
-         typename StorageIndex>
-struct TensorContractionKernelBlocking {
-  static void refine(const StorageIndex /*m*/,
-                     const StorageIndex /*n*/,
-                     const StorageIndex /*k*/,
-                     StorageIndex* /*bm*/,
-                     StorageIndex* /*bn*/,
-                     StorageIndex* /*bk*/) {
-    // By default we do nothing and stick to the block sizes picked by Eigen.
-  }
-};
-
-#if defined(EIGEN_USE_MKLDNN)
-// If all scalar types in tensor contraction are floats, we can use mkldnn gemm
-// as our low level kernel.
-template<typename StorageIndex, typename OutputMapper, typename LhsMapper,
-         typename RhsMapper>
-struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
-                               LhsMapper, RhsMapper> {
-  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
-  typedef float Scalar;
-
-  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
-
-  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
-                                     typename LhsMapper::SubMapper, ColMajor>
-      LhsPacker;
-
-  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
-                                     typename RhsMapper::SubMapper, ColMajor>
-      RhsPacker;
-
-  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
-      GemmKernel;
-
-  EIGEN_DONT_INLINE
-  static void packLhs(Scalar* lhsBlock,
-                      const typename LhsMapper::SubMapper& data_mapper,
-                      StorageIndex depth, StorageIndex rows) {
-    LhsPacker()(lhsBlock, data_mapper, rows, depth);
-  }
-
-  EIGEN_DONT_INLINE
-  static void packRhs(Scalar* rhsBlock,
-                      const typename RhsMapper::SubMapper& data_mapper,
-                      const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(rhsBlock, data_mapper, depth, cols);
-  }
-
-  EIGEN_DONT_INLINE
-  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
-                     const Scalar* rhsBlock, const StorageIndex rows,
-                     const StorageIndex depth, const StorageIndex cols,
-                     const Scalar alpha) {
-    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
-  }
-};
-
-// For mkldnn_sgemm having the right dimensions (especially for small matrices)
-// is more important than fitting all the working set in L1/L2 caches.
-template<typename StorageIndex>
-struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
-  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
-  static const StorageIndex kUnrollM = 48;
-  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the closest
-  // number that divides to both of them.
-  static const StorageIndex kUnrollN = 24;
-
-  static void refine(const StorageIndex m,
-                     const StorageIndex n,
-                     const StorageIndex /*k*/,
-                     StorageIndex* bm,
-                     StorageIndex* bn,
-                     StorageIndex* /*bk*/) {
-    // TODO(ezhulenev): There is probably a better way to pick block sizes.
-    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
-    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
-    // Stick with default bk.
-  }
-};
-
-#endif // EIGEN_USE_MKLDNN
-} // namespace internal
-
 template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
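The removed float specialization of TensorContractionKernelBlocking rounds the default block sizes up to mkldnn's unroll factors without exceeding the problem size. A standalone sketch of that rounding, with made-up example numbers (not taken from any real run):

    #include <algorithm>
    #include <cstdio>

    static long divup(long x, long y) { return (x + y - 1) / y; }

    int main() {
      const long kUnrollM = 48, kUnrollN = 24;      // unroll factors quoted above
      long m = 1000, n = 30;                        // hypothetical problem sizes
      long bm = 100, bn = 64;                       // block sizes from the default blocking
      bm = std::min(m, divup(bm, kUnrollM) * kUnrollM);  // 100 -> 144
      bn = std::min(n, divup(bn, kUnrollN) * kUnrollN);  // 64 -> 72, capped to n = 30
      std::printf("bm=%ld bn=%ld\n", bm, bn);            // prints bm=144 bn=30
      return 0;
    }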
@ -295,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Again, we don't know number of threads yet, so we use 2.
     Index bm, bn, bk;
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, 2);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, 2);
       bm = blocking.mc();
@ -332,24 +161,20 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Now that we know number of threads, recalculate sharding and blocking.
     shard_by_col = shardByCol(m, n, num_threads);
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     }
-    // Refine blocking choice to work well with contraction kernel.
-    internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
-                                              Index>::refine(m, n, k, &bm,
-                                                             &bn, &bk);
 
     // Number of kernels for each dimension.
     Index nm0 = divup(m, bm);
@ -23,17 +23,6 @@ else(XSMM_FOUND)
   ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
 endif(XSMM_FOUND)
 
-find_package(Mkldnn)
-if(MKLDNN_FOUND)
-  add_definitions("-DEIGEN_USE_MKLDNN")
-  include_directories(${MKLDNN_INCLUDES})
-  link_directories(${MKLDNN_LIBRARIES})
-  set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
-  ei_add_property(EIGEN_TESTED_BACKENDS "Mkldd, ")
-else(MKLDNN_FOUND)
-  ei_add_property(EIGEN_MISSING_BACKENDS "Mkldnn, ")
-endif(MKLDNN_FOUND)
-
 find_package(GoogleHash)
 if(GOOGLEHASH_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@ -191,10 +180,6 @@ if(EIGEN_TEST_CXX11)
     ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
   endif(EIGEN_TEST_SYCL)
 
-  if (MKLDNN_FOUND)
-    ei_add_test(cxx11_tensor_contraction_mkldnn)
-  endif (MKLDNN_FOUND)
-
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
@ -1,134 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-#include <Eigen/CXX11/Tensor>
-
-using Eigen::internal::blas_data_mapper;
-using Eigen::internal::mkldnn_gemm_kernel;
-using Eigen::internal::mkldnn_gemm_pack;
-
-template <int NumDims>
-static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
-  array<Index, NumDims> dims;
-  for (int i = 0; i < NumDims; ++i) {
-    dims[i] = internal::random<int>(min_dim, max_dim);
-  }
-  return dims;
-}
-
-// Packing with mkldnn_gemm_pack is the same as taking a slice of 2 dimensional
-// Tensor.
-template <typename Scalar>
-static void test_mkldnn_gemm_pack() {
-  static const int Options = 0 | ColMajor;
-
-  typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
-  typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
-  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
-
-  array<Index, 2> dims = RandomDims<2>(1, 500);
-
-  // Create a tensor initialized with random data.
-  Tensor2d src(dims);
-  src.setRandom();
-
-  // Pick a random slice of src tensor.
-  array<Index, 2> slice_start = RandomDims<2>(0, 250);
-  array<Index, 2> slice_size = RandomDims<2>(100, 500);
-  // Make sure that slice start + size do not overflow tensor dims.
-  for (int i = 0; i < 2; ++i) {
-    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
-    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
-  }
-
-  // Prepare tensors for packing and slicing results.
-  Tensor2d pack_dst(slice_size[0], slice_size[1]);
-  Tensor2d slice_dst(slice_size[0], slice_size[1]);
-
-  // Pack memory using mkldnn_gemm_pack.
-  DataMapper data_mapper(src.data(), dims[0]);
-  MkldnnGemmPack gemm_pack;
-  gemm_pack(pack_dst.data(),
-            data_mapper.getSubMapper(slice_start[0], slice_start[1]),
-            slice_size[0], slice_size[1]);
-  // Slice the source tensor.
-  slice_dst = src.slice(slice_start, slice_size);
-
-  // Verify that dst tensors are equal.
-  VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
-                  slice_dst.dimensions().TotalSize());
-  for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
-    Scalar packed = pack_dst.coeff(i);
-    Scalar sliced = slice_dst.coeff(i);
-    VERIFY_IS_EQUAL(packed, sliced);
-  }
-}
-template <typename Scalar>
-static void test_mkldnn_gemm_kernel() {
-  static const int Options = 0 | ColMajor;
-
-  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
-
-  int m = internal::random<int>(1, 100);
-  int n = internal::random<int>(1, 100);
-  int k = internal::random<int>(1, 100);
-
-  Tensor2d lhs(m, k);
-  lhs.setRandom();
-
-  Tensor2d rhs(k, n);
-  rhs.setRandom();
-
-  // Compute matmul with mkldnn gemm kernel.
-  typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-  typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>
-      MkldnnGemmKernel;
-
-  Tensor2d mkldnn_result(m, n);
-  mkldnn_result.setZero();
-
-  OutputMapper output_mapper(mkldnn_result.data(), m);
-  MkldnnGemmKernel gemm_kernel;
-  gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
-
-  // Compute matmul with Eigen::Matrix.
-  typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
-  typedef Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> > MatrixMap;
-
-  MatrixMap lhs_mat(lhs.data(), m, k);
-  MatrixMap rhs_mat(rhs.data(), k, n);
-  Matrix matmul_result(m, n);
-  matmul_result.setZero();
-
-  matmul_result = lhs_mat * rhs_mat;
-
-  static const float error_threshold = 1e-4f;
-
-  // Verify that results are equal.
-  for (Index i = 0; i < m * n; ++i) {
-    Scalar gemm = mkldnn_result(i);
-    Scalar matmul = matmul_result(i % m, i / m);
-    if ((std::abs)(gemm) > error_threshold &&
-        (std::abs)(matmul) > error_threshold) {
-      if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
-        std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
-      VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
-    }
-  }
-}
-
-EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
-  CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
-  CALL_SUBTEST(test_mkldnn_gemm_pack<double>());
-
-  // mkldnn has only sgemm (aka gemm for floats).
-  CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
-}