mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-08-12 03:39:01 +08:00
Adding tensor contraction operation backend for SYCL; adding test for contractionOp SYCL backend; adding temporary solution to prevent memory leak in buffer; cleaning up cxx11_tensor_builtins_sycl.h
This commit is contained in:
parent 3d59a47720
commit 2d4a091beb
@@ -156,9 +156,9 @@ struct TensorContractionEvaluatorBase
        m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                           op.rhsExpression(), op.lhsExpression()), device),
        m_device(device),
        m_result(NULL) {
        m_result(NULL), m_expr_indices(op.indices()) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
                         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

@@ -327,7 +327,7 @@ struct TensorContractionEvaluatorBase
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
    m_leftImpl.evalSubExprsIfNeeded(NULL);
    m_rightImpl.evalSubExprsIfNeeded(NULL);
    if (data) {

@@ -564,6 +564,9 @@ struct TensorContractionEvaluatorBase
  TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
  const Device& m_device;
  Scalar* m_result;
  /// required for sycl
  const Indices m_expr_indices;
};

@@ -621,6 +624,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
  }
};

} // end namespace Eigen
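The base evaluator now keeps a copy of the contraction indices so the SYCL backend can rebuild the contraction expression on the device; the indices() accessor added in TensorContractionSycl.h below reads this member, and the functor extractor later captures it. A minimal sketch of that pattern, with illustrative names rather than the exact Eigen internals:

// Minimal sketch, assuming a hypothetical evaluator that must be reconstructible on a device:
// the host-side evaluator stores the compile-time-sized index pairs by value so a kernel functor
// can copy them and rebuild the same contraction without touching host-only expression objects.
template <typename Indices>
struct ContractionEvaluatorSketch {
  explicit ContractionEvaluatorSketch(const Indices& idx) : m_expr_indices(idx) {}
  // exposed so a functor extractor can capture the indices when packaging the expression for the device
  const Indices& indices() const { return m_expr_indices; }
 private:
  const Indices m_expr_indices;  // stored by value: trivially copyable into a SYCL kernel
};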
@@ -22,8 +22,14 @@ enum {
/*
 * Implementation of the Eigen blas_data_mapper class for tensors.
 */
/// The MakePointer class is used by SYCL in order to build the mapper class on the device. For other
/// platforms the default MakePointer is used, which is scalar* for CoeffLoader.
template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> struct CoeffLoader;
template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
         int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
         template <class> class MakePointer_ = MakePointer> class BaseTensorContractionMapper;

template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> struct CoeffLoader {
  enum {
    DirectOffsets = false
  };

@@ -47,7 +53,7 @@ template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
  const Tensor m_tensor;
};

template <typename Tensor> struct CoeffLoader<Tensor, true> {
template <typename Tensor, template <class> class MakePointer_> struct CoeffLoader<Tensor, true, MakePointer_> {
  enum {
    DirectOffsets = true
  };

@@ -67,13 +73,14 @@ template <typename Tensor> struct CoeffLoader<Tensor, true> {
  }
 private:
  typedef typename Tensor::Scalar Scalar;
  const Scalar* m_data;
  typename MakePointer_<const Scalar>::Type m_data;
};

template<typename Scalar, typename Index, int side,
         typename Tensor,
         typename nocontract_t, typename contract_t,
         int packet_size, bool inner_dim_contiguous, int Alignment>
         int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
class SimpleTensorContractionMapper {
 public:
  EIGEN_DEVICE_FUNC

@@ -89,7 +96,7 @@ class SimpleTensorContractionMapper {
      m_k_strides(k_strides) { }

  enum {
    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
  };

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {

@@ -206,23 +213,22 @@ class SimpleTensorContractionMapper {
  }

 protected:
  CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
  CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
  const nocontract_t m_nocontract_strides;
  const nocontract_t m_ij_strides;
  const contract_t m_contract_strides;
  const contract_t m_k_strides;
};

template<typename Scalar, typename Index, int side,
         typename Tensor,
         typename nocontract_t, typename contract_t,
         int packet_size, bool inner_dim_contiguous,
         bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
{
 public:
  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;

  EIGEN_DEVICE_FUNC
  BaseTensorContractionMapper(const Tensor& tensor,

@@ -307,11 +313,11 @@ template<typename Scalar, typename Index, int side,
         typename Tensor,
         typename nocontract_t, typename contract_t,
         bool inner_dim_contiguous,
         bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
{
 public:
  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;

  EIGEN_DEVICE_FUNC
  BaseTensorContractionMapper(const Tensor& tensor,

@@ -345,14 +351,14 @@ template<typename Scalar, typename Index, int side,
         typename Tensor,
         typename nocontract_t, typename contract_t,
         int packet_size,
         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionSubMapper {
 public:
  typedef typename Tensor::PacketReturnType Packet;
  typedef typename unpacket_traits<Packet>::half HalfPacket;

  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
  typedef Self LinearMapper;

  enum {

@@ -452,14 +458,14 @@ template<typename Scalar_, typename Index, int side,
         typename Tensor,
         typename nocontract_t, typename contract_t,
         int packet_size,
         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionInputMapper
  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {

 public:
  typedef Scalar_ Scalar;
  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
  typedef SubMapper VectorMapper;

  EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
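The recurring change in this file is the new MakePointer_ template template parameter, which lets the SYCL backend swap raw Scalar* members for device pointer/accessor types while every other backend keeps the default. A minimal sketch of the idea follows, with hypothetical names (MyGlobalPtr is illustrative and stands in for Eigen's device-side pointer policy):

#include <cstddef>

// Default policy: "pointer to T" is just T*.
template <class T> struct MakePointerSketch { typedef T* Type; };

// Hypothetical device policy: wrap the raw pointer so device-specific bookkeeping can be attached.
template <class T> struct MyGlobalPtr {
  struct Wrapped { T* p; std::size_t offset; };
  typedef Wrapped Type;
};

// A loader parameterised on the policy, mirroring how CoeffLoader's m_data type is selected.
template <class Scalar, template <class> class MakePointer_ = MakePointerSketch>
struct LoaderSketch {
  typename MakePointer_<const Scalar>::Type m_data;  // const Scalar* by default, Wrapped under the "device" policy
};

int main() {
  LoaderSketch<float> host_loader;                 // m_data is const float*
  LoaderSketch<float, MyGlobalPtr> device_loader;  // m_data is MyGlobalPtr<const float>::Wrapped
  (void)host_loader; (void)device_loader;
  return 0;
}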
unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h (new file, 355 lines)
@@ -0,0 +1,355 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorContractionSycl.h
 *
 * \brief:
 *  TensorContractionSycl
 *
 *****************************************************************/

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
namespace Eigen {

template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {

  typedef const Eigen::SyclDevice Device;

  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
  typedef TensorContractionEvaluatorBase<Self> Base;
  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  enum {
    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
  };

  // Most of the code is assuming that both input tensors are ColMajor. If the
  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
  // will pretend B is LHS and A is RHS.
  typedef typename internal::conditional<
      static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
  typedef typename internal::conditional<
      static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;

  static const int LDims =
      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
  static const int RDims =
      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
  static const int ContractDims = internal::array_size<Indices>::value;

  typedef array<Index, LDims> left_dim_mapper_t;
  typedef array<Index, RDims> right_dim_mapper_t;

  typedef array<Index, ContractDims> contract_t;
  typedef array<Index, LDims - ContractDims> left_nocontract_t;
  typedef array<Index, RDims - ContractDims> right_nocontract_t;

  static const int NumDims = LDims + RDims - 2 * ContractDims;

  typedef DSizes<Index, NumDims> Dimensions;

  // typedefs needed in evalTo
  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;

  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;

  typedef typename LeftEvaluator::Dimensions LeftDimensions;
  typedef typename RightEvaluator::Dimensions RightDimensions;

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
      Base(op, device) {}

  // We need to redefine this method to make nvcc happy
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      evalTo(data);
      return false;
    } else {
      this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
      evalTo(this->m_result);
      return true;
    }
  }
  const Eigen::SyclDevice& device() const {return this->m_device;}
  void evalTo(Scalar* buffer) const {
    // Here is the result
    if (this->m_lhs_inner_dim_contiguous) {
      if (this->m_rhs_inner_dim_contiguous) {
        if (this->m_rhs_inner_dim_reordered) {
          evalTyped<true, true, true, Unaligned>(buffer);
        }
        else {
          evalTyped<true, true, false, Unaligned>(buffer);
        }
      }
      else {
        if (this->m_rhs_inner_dim_reordered) {
          evalTyped<true, false, true, Unaligned>(buffer);
        }
        else {
          evalTyped<true, false, false, Unaligned>(buffer);
        }
      }
    }
    else {
      if (this->m_rhs_inner_dim_contiguous) {
        if (this->m_rhs_inner_dim_reordered) {
          evalTyped<false, true, true, Unaligned>(buffer);
        }
        else {
          evalTyped<false, true, false, Unaligned>(buffer);
        }
      }
      else {
        if (this->m_rhs_inner_dim_reordered) {
          evalTyped<false, false, true, Unaligned>(buffer);
        }
        else {
          evalTyped<false, false, false, Unaligned>(buffer);
        }
      }
    }
  }
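The nested if/else in evalTo above is the usual Eigen pattern for turning three runtime flags into compile-time template parameters, so the kernel below can be specialised per combination. A compressed sketch of the same dispatch idea (the dispatch helper and kernel names here are illustrative, not part of Eigen):

#include <iostream>

// Hypothetical kernel specialised on two compile-time flags.
template <bool ContiguousLhs, bool ContiguousRhs>
void run_kernel() {
  std::cout << "kernel<" << ContiguousLhs << "," << ContiguousRhs << ">\n";
}

// Runtime booleans are peeled off one by one; each branch fixes one template argument.
void dispatch(bool contiguous_lhs, bool contiguous_rhs) {
  if (contiguous_lhs) {
    if (contiguous_rhs) run_kernel<true, true>();
    else                run_kernel<true, false>();
  } else {
    if (contiguous_rhs) run_kernel<false, true>();
    else                run_kernel<false, false>();
  }
}

int main() {
  dispatch(true, false);  // picks kernel<1,0> at compile time, chosen at run time
  return 0;
}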
  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
  void evalTyped(Scalar* buffer) const {
    // columns in left side, rows in right side
    const Index k = this->m_k_size;
    EIGEN_UNUSED_VARIABLE(k)
    // rows in left side
    const Index m = this->m_i_size;
    // columns in right side
    const Index n = this->m_j_size;

    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
    LaunchSyclKernels<LhsScalar, RhsScalar, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
        this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
        this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
  }
  // required by sycl to construct the expr on the device. Returns the original left_impl
  const TensorEvaluator<LeftArgType, Device>& left_impl() const {
    return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl);
  }
  // required by sycl to construct the expr on the device. Returns the original right_impl
  const TensorEvaluator<RightArgType, Device>& right_impl() const {
    return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
  }
  // required by sycl to construct the expr on the device
  const Indices& indices() const {return this->m_expr_indices;}
};

/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
template<typename Expr> struct TensorEvaluatorContainer;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{
  typedef Eigen::DefaultDevice Device;
  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType;
  enum {
    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
  };

  typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
  typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;

  TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device)
      : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                          op.lhsExpression(), op.rhsExpression()), device),
        m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                           op.rhsExpression(), op.lhsExpression()), device){}
  LeftEvaluator m_leftImpl;
  RightEvaluator m_rightImpl;
};

#define TileSizeDimM 32                                                            // Tile size for dimension M
#define TileSizeDimN 32                                                            // Tile size for dimension N
#define TileSizeDimK 16                                                            // Tile size for dimension K
#define WorkLoadPerThreadM 4                                                       // Work load per thread in dimension M
#define WorkLoadPerThreadN 4                                                       // Work load per thread in dimension N
#define LocalThreadSizeM (TileSizeDimM/WorkLoadPerThreadM)                         // Local thread size for the first dimension (M here)
#define LocalThreadSizeN (TileSizeDimN/WorkLoadPerThreadN)                         // Local thread size for the second dimension (N here)
#define LoadPerThreadLhs ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)) // workload per thread for the Lhs expression
#define LoadPerThreadRhs ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)) // workload per thread for the Rhs expression
#define RoundUp(x,y) ((((x) + (y) - 1) / (y))*(y))                                 // RoundUp function to make sure that the global thread count is divisible by the local thread count
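RoundUp pads a size up to the next multiple of the tile/work-group size so that the nd_range launched below divides evenly; out-of-range elements are masked off inside the kernel. A small worked check of the arithmetic (plain C++, independent of SYCL):

#include <cassert>

// Same arithmetic as the RoundUp macro: smallest multiple of y that is >= x (for positive x, y).
constexpr int round_up(int x, int y) { return ((x + y - 1) / y) * y; }

int main() {
  assert(round_up(100, 16) == 112);  // K = 100 padded up to the 16-wide K tile
  assert(round_up(128, 32) == 128);  // already a multiple: unchanged
  assert(round_up(1, 32) == 32);
  return 0;
}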
template <typename PLEXPR, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct KernelNameConstructor;
template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
  template< typename Self, typename Output, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
  static void Run(const Self& self, Output* buffer, Index M, Index N, Index K,
      ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
      LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
    // create a tuple of accessors from the Evaluator
    typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<typename Self::XprType>::Type PlaceHolderExpr;
    typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
    auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
    Index roundUpK = RoundUp(K, TileSizeDimK);
    Index roundUpM = RoundUp(M, TileSizeDimM);
    Index roundUpN = RoundUp(N, TileSizeDimN);
    self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
      auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
      // Local memory for elements of Lhs
      cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
      // Local memory for elements of Rhs
      cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
      // Output memory
      auto out_privateRes= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
      // sycl parallel for
      cgh.parallel_for<KernelName>( cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), [=](cl::sycl::nd_item<2> itemID) {
        typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<typename Self::XprType>::Type DevExpr;
        auto device_expr = Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
        typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
        typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
                                                       typename DevEvaluator::LeftEvaluator, LeftNocontractT,
                                                       ContractT, 1,
                                                       lhs_inner_dim_contiguous,
                                                       false, Unaligned, MakeGlobalPointer> LhsMapper;

        typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
                                                       typename DevEvaluator::RightEvaluator, RightNocontractT,
                                                       ContractT, 1,
                                                       rhs_inner_dim_contiguous,
                                                       rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
        // initializing the data mappers must happen inside the kernel for device-side evaluation
        LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
        RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
        auto out_ptr = ConvertToActualTypeSycl(Output, out_privateRes);
        // Matmul Kernel
        // Thread identifiers
        const int mLocalThreadId = itemID.get_local(0); // Local ID row
        const int nLocalThreadId = itemID.get_local(1); // Local ID col
        const int mGroupId = itemID.get_group(0); // Work-group ID row
        const int nGroupId = itemID.get_group(1); // Work-group ID col
        const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
        // Allocate register space
        float privateLhs;
        float privateRhs[WorkLoadPerThreadN];
        float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
        // Initialise the private accumulation registers
        for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
          for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
            privateRes[wLPTM][wLPTN] = 0.0f;
          }
        }

        // Tile Lhs
        for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
          int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
          int localLhsRow = localLhsLinearId% TileSizeDimM;
          int localLhsCol = localLhsLinearId/TileSizeDimM;
          // Load the value (wide vector load)
          int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
          localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<Output>(0);
        }
        // Tile Rhs
        for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
          int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
          int localRhsRow = localRhsLinearId% TileSizeDimN;
          int localRhsCol = localRhsLinearId/TileSizeDimN;
          // Load the value (wide vector load)
          int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
          localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<Output>(0);
        }
        // Loop over all tiles
        const int numTiles = roundUpK/TileSizeDimK;
        int firstHalf=0;
        do {
          // Synchronise
          itemID.barrier(cl::sycl::access::fence_space::local_space);
          // Load the next tile of Lhs and Rhs into local memory
          int nextHalf = firstHalf + 1;
          if (nextHalf < numTiles) {
            // Tile A
            for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
              int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
              int localLhsRow = localLhsLinearId% TileSizeDimM;
              int localLhsCol = localLhsLinearId/TileSizeDimM;
              // global K id
              int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
              // Store the loaded value into local memory
              localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<Output>(0);
            }
            // Tile B
            for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
              int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
              int localRhsRow = localRhsLinearId% TileSizeDimN;
              int localRhsCol = localRhsLinearId/TileSizeDimN;
              // Load the value (wide vector load)
              int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
              // Store the loaded vector into local memory
              localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<Output>(0);
            }
          }
          // Loop over the values of a single tile
          for (int k=0; k<TileSizeDimK; k++) {
            // Cache the values of localRhs in registers
            for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
              int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
              privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
            }
            // Perform the computation
            for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
              int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
              privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
              for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
                privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
              }
            }
          }
          // Next tile
          firstHalf++;
        } while (firstHalf<numTiles);

        // Store the final results in C
        for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
          int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
          if (globalRow< M){
            for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
              int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
              if(globalCol<N)
                out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
            }
          }
        }

      /// End the kernel
      });
    });
    self.device().synchronize();
  }
};

} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
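To make the double-buffered tiling above easier to follow, here is a plain single-threaded C++ sketch of the same accumulation scheme: the K dimension is walked tile by tile, a tile of each operand is staged, out-of-range loads are masked to zero, and each element of the column-major result is accumulated in a register before being stored as out_ptr[globalCol*M + globalRow]. The sizes and names are illustrative; this is a host-side model of the kernel's indexing, not the SYCL code itself.

#include <cassert>
#include <vector>

// Reference model of the tiled accumulation: C (MxN, column-major) = A (MxK) * B (KxN).
// Out-of-range loads are replaced by zero, exactly like the bounds checks in the kernel.
void tiled_matmul_reference(const std::vector<float>& A, const std::vector<float>& B,
                            std::vector<float>& C, int M, int N, int K) {
  const int TileK = 16;                           // plays the role of TileSizeDimK
  const int numTiles = (K + TileK - 1) / TileK;   // RoundUp(K, TileK) / TileK
  for (int col = 0; col < N; ++col) {
    for (int row = 0; row < M; ++row) {
      float acc = 0.0f;                           // the per-thread register accumulator
      for (int tile = 0; tile < numTiles; ++tile) {
        // Walk one K-tile; a real kernel copies this tile into work-group local memory once.
        for (int kk = 0; kk < TileK; ++kk) {
          int k = tile * TileK + kk;
          float a = (k < K) ? A[k * M + row] : 0.0f;   // masked load, as in the kernel
          float b = (k < K) ? B[col * K + k] : 0.0f;
          acc += a * b;
        }
      }
      C[col * M + row] = acc;                     // column-major store: C[globalCol*M + globalRow]
    }
  }
}

int main() {
  int M = 3, N = 2, K = 5;
  std::vector<float> A(M * K, 1.0f), B(K * N, 2.0f), C(M * N, 0.0f);
  tiled_matmul_reference(A, B, C, M, N, K);
  assert(C[0] == 2.0f * K);  // every entry is the sum over K of 1 * 2
  return 0;
}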
@@ -31,7 +31,7 @@ namespace Eigen {
    auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
    auto globalid = itemID.get_global_linear_id();
    if (globalid < m_rng) {
      dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
    }
  }

@@ -50,7 +50,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device
    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU)
    auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
    if((*it).is_cpu() && s.find("amd")!=std::string::npos){
    if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp
      it=devices.erase(it);
    }
    else{

@@ -72,9 +72,9 @@ struct QueueInterface {
  mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
  /// sycl queue
  mutable cl::sycl::queue m_queue;
  /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured throufh dev_Selector typename
  /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
  /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
  template<typename dev_Selector> explicit QueueInterface(dev_Selector s):
  template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
#ifdef EIGEN_EXCEPTIONS
  m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
    for (const auto& e : l) {

@@ -103,17 +103,21 @@ struct QueueInterface {
    auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
    buf.set_final_data(nullptr);
    std::lock_guard<std::mutex> lock(mutex_);
    buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(ptr,buf));
    buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
    return static_cast<void*>(ptr);
  }

  /// This is used to deallocate the device pointer. p is used as a key inside
  /// the map to find the device buffer and delete it.
  EIGEN_STRONG_INLINE void deallocate(const void *p) const {
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = buffer_map.find(static_cast<const uint8_t*>(p));
    if (it != buffer_map.end()) {
      auto num_bytes =it->second.get_size();
      buffer_map.erase(it);
      // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version
      std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1>
      a1.deallocate(static_cast<uint8_t*>(p), num_bytes);
    }
  }
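The hunk above pairs each allocation with an entry in buffer_map keyed by the host-visible pointer, and the deallocate path erases the entry and then frees the host allocation explicitly as a stop-gap for the ComputeCpp leak mentioned in the commit message. A simplified sketch of that ownership pattern (plain C++, no SYCL; RawBuffer stands in for cl::sycl::buffer):

#include <cstdint>
#include <cstdlib>
#include <map>
#include <mutex>

// Stand-in for the device-side buffer object owned by the map.
struct RawBuffer { std::uint8_t* host_ptr; std::size_t bytes; };

class BufferRegistry {
 public:
  void* allocate(std::size_t num_bytes) {
    auto* p = static_cast<std::uint8_t*>(std::malloc(num_bytes));
    std::lock_guard<std::mutex> lock(mutex_);
    map_.emplace(p, RawBuffer{p, num_bytes});   // key: the pointer handed back to the caller
    return p;
  }
  void deallocate(void* p) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = map_.find(static_cast<std::uint8_t*>(p));
    if (it != map_.end()) {
      map_.erase(it);   // drop the registry entry first
      std::free(p);     // then release the host allocation explicitly
    }
  }
 private:
  mutable std::mutex mutex_;
  std::map<const std::uint8_t*, RawBuffer> map_;
};

int main() {
  BufferRegistry registry;
  void* p = registry.allocate(256);
  registry.deallocate(p);
  return 0;
}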
@@ -188,7 +192,7 @@ struct SyclDevice {
    return m_queue_stream->allocate(num_bytes);
  }
  /// deallocate device memory
  EIGEN_STRONG_INLINE void deallocate(const void *p) const {
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    m_queue_stream->deallocate(p);
  }

@@ -235,25 +239,25 @@ struct SyclDevice {
    size_t rng, GRange, tileSize;
    parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
    // Assuming that the dst is the start of the destination pointer
    auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(rng*sizeof(T)));
    auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
    sycl_queue().submit([&](cl::sycl::handler &cgh) {
      auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
      auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, 0));
    });
    synchronize();
  }
  /// returning the sycl queue
  EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
  /// Here is the implementation of memset function on sycl.
  template<typename T> EIGEN_STRONG_INLINE void memset(T *buff, int c, size_t n) const {
  template<typename T> EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const {
    size_t rng, GRange, tileSize;
    parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
    sycl_queue().submit([&](cl::sycl::handler &cgh) {
      auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(buff))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
      auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
      cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        auto globalid=itemID.get_global_linear_id();
        if (globalid< n) {
        if (globalid< rng) {
          for(size_t i=0; i<sizeof(T); i++)
            buf_acc[globalid*sizeof(T) + i] = c;
        }
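In the memset above, each work item owns one element of type T and fills its sizeof(T) bytes with the byte value c, so the bounds check is against the element count rng rather than the byte count n. A tiny host-side model of that indexing (illustrative only):

#include <cassert>
#include <cstring>
#include <vector>

// Each "work item" globalid fills the sizeof(T) bytes of element globalid with the byte value c.
template <typename T>
void memset_by_element(std::vector<unsigned char>& bytes, int c, std::size_t num_elements) {
  for (std::size_t globalid = 0; globalid < num_elements; ++globalid) {   // bound: elements, not bytes
    for (std::size_t i = 0; i < sizeof(T); ++i)
      bytes[globalid * sizeof(T) + i] = static_cast<unsigned char>(c);
  }
}

int main() {
  std::vector<unsigned char> storage(4 * sizeof(float));
  memset_by_element<float>(storage, 0, 4);   // zero out four floats
  float zeros[4];
  std::memcpy(zeros, storage.data(), sizeof(zeros));
  assert(zeros[0] == 0.0f && zeros[3] == 0.0f);
  return 0;
}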
@@ -32,6 +32,7 @@ struct TensorEvaluator
  typedef typename Derived::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef typename Derived::Dimensions Dimensions;
  typedef Derived XprType;

  // NumDimensions is -1 for variable dim tensors
  static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?

@@ -152,6 +153,8 @@ struct TensorEvaluator<const Derived, Device>
  typedef typename Derived::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef typename Derived::Dimensions Dimensions;
  typedef const Derived XprType;

  // NumDimensions is -1 for variable dim tensors
  static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -80,5 +80,7 @@ template<typename T> struct GetType<false, T>{
// sycl functors
#include "TensorSyclFunctors.h"

#include "TensorContractionSycl.h"

#endif // end of EIGEN_USE_SYCL
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
@@ -135,21 +135,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const)
KERNELBROKERCONVERTERSLICESTRIDEOP()
#undef KERNELBROKERCONVERTERSLICESTRIDEOP

#define KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(OPEXPR, CVQual)\
template<typename Param, typename XprType>\
struct ConvertToDeviceExpression<CVQual OPEXPR <Param, XprType> >{\
  typedef CVQual OPEXPR<Param, typename ConvertToDeviceExpression<XprType>::Type> Type;\
};

KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorPaddingOp, const)
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorPaddingOp, )

KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorReshapingOp, const)
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorReshapingOp, )

KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorShufflingOp, const)
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorShufflingOp, )
#undef KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP

} // namespace internal
} // namespace TensorSycl
@@ -223,26 +223,6 @@ SYCLSLICESTRIDEOPEXTACC()
#undef SYCLSLICESTRIDEOPEXTACC

#define PADDINGRESHAPEANDSHUFFOPEXTRACC(OPEXPR, CVQual)\
template<typename Param, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual OPEXPR<Param, XprType>, Dev> > {\
  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual OPEXPR<Param, XprType>, Dev>& eval)\
  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
    return AccessorConstructor::getTuple(cgh, eval.impl());\
  }\
};

// tensor padding
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorPaddingOp, const)
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorPaddingOp, )
// tensor reshaping
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, const)
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, )
/// Tensor shuffling
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, const)
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, )
#undef PADDINGRESHAPEANDSHUFFOPEXTRACC

/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
@@ -58,7 +58,7 @@ SYCLEXTRTENSORMAPFIXEDSIZE()
template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
  OP func;\
  const OP func;\
  FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
  : rhsExpr(expr.impl()), func(expr.functor()) {}\
};

@@ -74,7 +74,7 @@ template <template<class, class, class> class BinaryCategory, typename OP, typen
struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
  OP func;\
  const OP func;\
  FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
  : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
};

@@ -90,7 +90,7 @@ struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg
  FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
  FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
  FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
  OP func;\
  const OP func;\
  FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
  : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
};

@@ -241,6 +241,23 @@ PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
#undef PADDINGOPFUNCEXT

/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorContractionOp. The LHS and RHS here are the original ones, so there is no need to apply a condition on their type.
#define SYCLEXTRFUNCCONTRACT(CVQual)\
template <typename Indices, typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev> > {\
  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
  const Indices func;\
  FunctorExtractor(const TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev>& expr)\
  : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.indices()) {}\
};

SYCLEXTRFUNCCONTRACT(const)
SYCLEXTRFUNCCONTRACT()
#undef SYCLEXTRFUNCCONTRACT

/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
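The FunctorExtractor specialisations above strip a host-side evaluator down to the trivially copyable pieces a kernel needs (child extractors plus the functor, or for contraction the index pairs), mirroring the shape of the expression tree. A reduced sketch of that idea, with made-up node types:

#include <utility>

// Hypothetical host-side "evaluators" for a leaf and a binary node.
struct LeafEval   { int id;            int  impl() const { return id; } };
struct BinaryEval { LeafEval l, r; char op;
                    const LeafEval& left_impl() const  { return l; }
                    const LeafEval& right_impl() const { return r; }
                    char functor() const { return op; } };

// Extractors keep only device-copyable state, one specialisation per node kind.
template <typename Eval> struct Extractor;
template <> struct Extractor<LeafEval> {
  int id;
  explicit Extractor(const LeafEval& e) : id(e.impl()) {}
};
template <> struct Extractor<BinaryEval> {
  Extractor<LeafEval> lhsExpr, rhsExpr;   // child extractors mirror the expression tree
  const char func;                        // the captured functor (here just a tag)
  explicit Extractor(const BinaryEval& e)
      : lhsExpr(e.left_impl()), rhsExpr(e.right_impl()), func(e.functor()) {}
};

template <typename Eval>
Extractor<Eval> extract(const Eval& e) { return Extractor<Eval>(e); }

int main() {
  BinaryEval add{{1}, {2}, '+'};
  auto packed = extract(add);   // a plain struct of PODs, safe to copy into a kernel
  return packed.func == '+' ? 0 : 1;
}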
@@ -132,19 +132,6 @@ SLICESTRIDEOPLEAFCOUNT(const)
SLICESTRIDEOPLEAFCOUNT()
#undef SLICESTRIDEOPLEAFCOUNT

#define PADDINGRESHAPEANDSHUFFLELEAFCOUNT(OPEXPR, CVQual)\
template<typename Param, typename XprType>\
struct LeafCount<CVQual OPEXPR<Param, XprType> >:CategoryCount<XprType>{};

PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorPaddingOp, const)
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorPaddingOp, )

PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, const)
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, )

PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, const)
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, )
#undef PADDINGRESHAPEANDSHUFFLELEAFCOUNT

} /// namespace TensorSycl
} /// namespace internal
@@ -191,21 +191,6 @@ SYCLSLICESTRIDEOPPLH(const)
SYCLSLICESTRIDEOPPLH()
#undef SYCLSLICESTRIDEOPPLH

#define PADDINGRESHAPEANDSHUFFLEOPPLH(OPEXP , CVQual)\
template<typename Param, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual OPEXP<Param, XprType>, N > {\
  typedef CVQual OPEXP<Param, typename CalculateIndex<N, XprType>::ArgType> Type;\
};

PADDINGRESHAPEANDSHUFFLEOPPLH(TensorPaddingOp, const)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorPaddingOp,)

PADDINGRESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, const)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, )

PADDINGRESHAPEANDSHUFFLEOPPLH(TensorShufflingOp, const)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorShufflingOp,)
#undef PADDINGRESHAPEANDSHUFFLEOPPLH

/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>
@@ -56,7 +56,6 @@ void run(Expr &expr, Dev &dev) {
  });
  dev.synchronize();
}

  evaluator.cleanup();
}
} // namespace TensorSycl
@@ -150,6 +150,7 @@ if(EIGEN_TEST_CXX11)
    ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11")
    ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
    ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
    ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
  endif(EIGEN_TEST_SYCL)
  # It should be safe to always run these tests as there is some fallback code for
  # older compilers that don't support cxx11.
@@ -154,7 +154,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
    Tensor<SCALAR, 3> out(tensorRange); \
    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
    in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
    out = out.random() + static_cast<SCALAR>(0.01); \
    Tensor<SCALAR, 3> reference(out); \
    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \

@@ -169,8 +168,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
        (in_1.size()) * sizeof(SCALAR)); \
    sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
        (in_2.size()) * sizeof(SCALAR)); \
    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
        (out.size()) * sizeof(SCALAR)); \
    gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
        (out.size()) * sizeof(SCALAR)); \

@@ -192,7 +189,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
    Tensor<SCALAR, 3> out(tensorRange); \
    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
    in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
    out = out.random() + static_cast<SCALAR>(0.01); \
    Tensor<SCALAR, 3> reference(out); \
    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \

@@ -207,8 +203,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
        (in_1.size()) * sizeof(SCALAR)); \
    sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
        (in_2.size()) * sizeof(SCALAR)); \
    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
        (out.size()) * sizeof(SCALAR)); \
    gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
        (out.size()) * sizeof(SCALAR)); \

@@ -235,8 +229,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
    TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
        (in_1.size()) * sizeof(SCALAR)); \
    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
        (out.size()) * sizeof(SCALAR)); \
    gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
        (out.size()) * sizeof(SCALAR)); \
unsupported/test/cxx11_tensor_contract_sycl.cpp (new file, 218 lines)
@@ -0,0 +1,218 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL

#include <iostream>
#include <chrono>
#include <ctime>

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout, typename Device>
void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size)
{
  // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
  // Eigen::array<DimPair, 1> dims(DimPair(1, 0));
  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
  Eigen::array<int, 2> left_dims = {{m_size, k_size}};
  Eigen::array<int, 2> right_dims = {{k_size, n_size}};
  Eigen::array<int, 2> result_dims = {{m_size, n_size}};

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
  float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
  float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  t_result = t_left.contract(t_right, dims);

  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
              << " vs " << t_result_gpu(i) << std::endl;
    assert(false);
  }
  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}
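The verification loop above accepts a value when either the absolute error is below 1e-4 or the two floats compare approximately equal in the relative sense. A small standalone helper expressing that combined predicate (illustrative; the real test uses Eigen::internal::isApprox, which roughly scales the tolerance by the smaller magnitude):

#include <algorithm>
#include <cassert>
#include <cmath>

// True when |a - b| <= abs_tol, or when the difference is small relative to the smaller magnitude.
bool approx_equal(float a, float b, float abs_tol = 1e-4f, float rel_tol = 1e-4f) {
  float diff = std::fabs(a - b);
  if (diff <= abs_tol) return true;                                   // absolute check, useful near zero
  return diff <= rel_tol * std::min(std::fabs(a), std::fabs(b));      // relative check for large values
}

int main() {
  assert(approx_equal(1000.05f, 1000.0f));   // passes via the relative test
  assert(!approx_equal(1.0f, 1.1f));
  return 0;
}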
template<int DataLayout, typename Device>
void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
{
  //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
  Tensor<float, 0, DataLayout> t_result;
  Tensor<float, 0, DataLayout> t_result_gpu;
  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
  Eigen::array<int, 2> left_dims = {{m_size, k_size}};
  Eigen::array<int, 2> right_dims = {{k_size, n_size}};
  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = sizeof(float);

  float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
  float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
  float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));

  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result);

  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);

  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
  t_result = t_left.contract(t_right, dims);

  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
  if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f &&
      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
    std::cout << "mismatch detected: " << t_result()
              << " vs " << t_result_gpu() << std::endl;
    assert(false);
  }

  sycl_device.deallocate(d_t_left);
  sycl_device.deallocate(d_t_right);
  sycl_device.deallocate(d_t_result);
}

template<int DataLayout, typename Device>
void test_sycl_contraction_m(const Device& sycl_device) {
  for (int k = 32; k < 256; k++) {
    test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128);
  }
}

template<int DataLayout, typename Device>
void test_sycl_contraction_k(const Device& sycl_device) {
  for (int k = 32; k < 256; k++) {
    test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128);
  }
}

template<int DataLayout, typename Device>
void test_sycl_contraction_n(const Device& sycl_device) {
  for (int k = 32; k < 256; k++) {
    test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k);
  }
}

template<int DataLayout, typename Device>
void test_sycl_contraction_sizes(const Device& sycl_device) {
  int m_sizes[] = { 31,  39,   63,   64,   65,
                   127, 129,  255,  257 , 511,
                   512, 513, 1023, 1024, 1025};

  int n_sizes[] = { 31,  39,   63,   64,   65,
                   127, 129,  255,  257,  511,
                   512, 513, 1023, 1024, 1025};

  int k_sizes[] = { 31,  39,   63,   64,   65,
                    95,  96,  127,  129,  255,
                   257, 511,  512,  513, 1023,
                  1024, 1025};

  for (int i = 0; i < 15; i++) {
    for (int j = 0; j < 15; j++) {
      for (int k = 0; k < 17; k++) {
        test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
      }
    }
  }
}

template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){
  QueueInterface queueInterface(s);
  auto sycl_device=Eigen::SyclDevice(&queueInterface);
  test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32);
  test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32);
  test_scalar<ColMajor>(sycl_device, 32, 32, 32);
  test_scalar<RowMajor>(sycl_device, 32, 32, 32);
  std::chrono::time_point<std::chrono::system_clock> start, end;
  start = std::chrono::system_clock::now();
  test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128);
  test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128);
  test_scalar<ColMajor>(sycl_device, 128, 128, 128);
  test_scalar<RowMajor>(sycl_device, 128, 128, 128);
  test_sycl_contraction_m<ColMajor>(sycl_device);
  test_sycl_contraction_m<RowMajor>(sycl_device);
  test_sycl_contraction_n<ColMajor>(sycl_device);
  test_sycl_contraction_n<RowMajor>(sycl_device);
  test_sycl_contraction_k<ColMajor>(sycl_device);
  test_sycl_contraction_k<RowMajor>(sycl_device);
  test_sycl_contraction_sizes<ColMajor>(sycl_device);
  test_sycl_contraction_sizes<RowMajor>(sycl_device);
  end = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end-start;
  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
  std::cout << "finished computation at " << std::ctime(&end_time)
            << "elapsed time: " << elapsed_seconds.count() << "s\n";

}
void test_cxx11_tensor_contract_sycl() {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorContractionPerDevice(device));
  }
}