Adding tensor contraction operation backend for SYCL; adding test for contractionOp SYCL backend; adding temporary solution to prevent memory leak in buffer; cleaning up cxx11_tensor_builtins_sycl.h

This commit is contained in:
Mehdi Goli 2016-12-14 15:30:37 +00:00
parent 3d59a47720
commit 2d4a091beb
15 changed files with 648 additions and 110 deletions

View File

@@ -156,9 +156,9 @@ struct TensorContractionEvaluatorBase
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
m_device(device),
m_result(NULL) {
m_result(NULL), m_expr_indices(op.indices()) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -327,7 +327,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@@ -564,6 +564,9 @@ struct TensorContractionEvaluatorBase
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
/// required for sycl
const Indices m_expr_indices;
};
@@ -621,6 +624,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
}
};
} // end namespace Eigen

View File

@@ -22,8 +22,14 @@ enum {
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
/// The MakePointer class is used by SYCL in order to build the mapper class on the device. For other platforms the default MakePointer is used, which
/// is Scalar* for CoeffLoader.
template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> struct CoeffLoader;
template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
template <class> class MakePointer_ = MakePointer> class BaseTensorContractionMapper;
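For context, MakePointer_ is the indirection that lets the same mapper code carry either a raw host pointer or a device-friendly pointer type. A minimal sketch of the pattern, assuming the default described in the comment above; the device-side variant is only an assumption about the shape of MakeGlobalPointer (used later in TensorContractionSycl.h), not its actual definition:
// Default: the loader stores a plain raw pointer (Scalar*).
template <class T> struct MakePointer {
  typedef T* Type;
};
// Hypothetical device-side variant in the spirit of MakeGlobalPointer:
// map T to a SYCL global-memory pointer so CoeffLoader can be
// reconstructed inside the kernel.
template <class T> struct MakeGlobalPointerSketch {
  typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
};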
template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> struct CoeffLoader {
enum {
DirectOffsets = false
};
@@ -47,7 +53,7 @@ template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
const Tensor m_tensor;
};
template <typename Tensor> struct CoeffLoader<Tensor, true> {
template <typename Tensor, template <class> class MakePointer_> struct CoeffLoader<Tensor, true, MakePointer_> {
enum {
DirectOffsets = true
};
@@ -67,13 +73,14 @@ template <typename Tensor> struct CoeffLoader<Tensor, true> {
}
private:
typedef typename Tensor::Scalar Scalar;
const Scalar* m_data;
typename MakePointer_<const Scalar>::Type m_data;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous, int Alignment>
int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
@@ -89,7 +96,7 @@ class SimpleTensorContractionMapper {
m_k_strides(k_strides) { }
enum {
DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -206,23 +213,22 @@ class SimpleTensorContractionMapper {
}
protected:
CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -307,11 +313,11 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -345,14 +351,14 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionSubMapper {
public:
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
typedef Self LinearMapper;
enum {
@@ -452,14 +458,14 @@ template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
: public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
public:
typedef Scalar_ Scalar;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,

View File

@@ -0,0 +1,355 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorContractionSycl.h
*
* \brief:
* Tensor contraction for the SYCL backend
*
*****************************************************************/
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
namespace Eigen {
template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
typedef const Eigen::SyclDevice Device;
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
// will pretend B is LHS and A is RHS.
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
static const int LDims =
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, LDims> left_dim_mapper_t;
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, LDims - ContractDims> left_nocontract_t;
typedef array<Index, RDims - ContractDims> right_nocontract_t;
static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
// typedefs needed in evalTo
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
typedef typename LeftEvaluator::Dimensions LeftDimensions;
typedef typename RightEvaluator::Dimensions RightDimensions;
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
// We need to redefine this method to make nvcc happy
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
evalTo(data);
return false;
} else {
this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
evalTo(this->m_result);
return true;
}
}
const Eigen::SyclDevice& device() const {return this->m_device;}
void evalTo(Scalar* buffer) const {
// Dispatch on the contiguity/reordering flags of the two operands
if (this->m_lhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_reordered) {
evalTyped<true, true, true, Unaligned>(buffer);
}
else {
evalTyped<true, true, false, Unaligned>(buffer);
}
}
else {
if (this->m_rhs_inner_dim_reordered) {
evalTyped<true, false, true, Unaligned>(buffer);
}
else {
evalTyped<true, false, false, Unaligned>(buffer);
}
}
}
else {
if (this->m_rhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_reordered) {
evalTyped<false, true, true, Unaligned>(buffer);
}
else {
evalTyped<false, true, false, Unaligned>(buffer);
}
}
else {
if (this->m_rhs_inner_dim_reordered) {
evalTyped<false, false, true, Unaligned>(buffer);
}
else {
evalTyped<false, false, false, Unaligned>(buffer);
}
}
}
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalTyped(Scalar* buffer) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
EIGEN_UNUSED_VARIABLE(k)
// rows in left side
const Index m = this->m_i_size;
// columns in right side
const Index n = this->m_j_size;
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
}
// Required by SYCL to construct the expression on the device. Returns the original left_impl.
const TensorEvaluator<LeftArgType, Device>& left_impl() const {
return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl);
}
// Required by SYCL to construct the expression on the device. Returns the original right_impl.
const TensorEvaluator<RightArgType, Device>& right_impl() const {
return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
}
// Required by SYCL to construct the expression on the device
const Indices& indices() const {return this->m_expr_indices;}
};
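For orientation, this evaluator is what a device-assigned contraction expression ultimately lands on. A minimal usage sketch, mirroring the test added later in this commit (d_a, d_b, d_c stand for pointers obtained from sycl_device.allocate, and m, k, n for the matrix sizes):
typedef Tensor<float, 1>::DimensionPair DimPair;
Eigen::TensorMap<Eigen::Tensor<float, 2> > A(d_a, m, k);
Eigen::TensorMap<Eigen::Tensor<float, 2> > B(d_b, k, n);
Eigen::TensorMap<Eigen::Tensor<float, 2> > C(d_c, m, n);
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
// Triggers evalSubExprsIfNeeded -> evalTo -> evalTyped -> LaunchSyclKernels::Run
C.device(sycl_device) = A.contract(B, dims);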
/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
template<typename Expr> struct TensorEvaluatorContainer;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{
typedef Eigen::DefaultDevice Device;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType;
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};
typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device)
: m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.lhsExpression(), op.rhsExpression()), device),
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device){}
LeftEvaluator m_leftImpl;
RightEvaluator m_rightImpl;
};
#define TileSizeDimM 32 // Tile size for dimension M
#define TileSizeDimN 32 // Tile size for dimension N
#define TileSizeDimK 16 // Tile size for dimension K
#define WorkLoadPerThreadM 4 // Work load per thread in dimension M
#define WorkLoadPerThreadN 4 // work load per thread in dimension N
#define LocalThreadSizeM (TileSizeDimM/WorkLoadPerThreadM) // Local thread size for the first dimension (M here)
#define LocalThreadSizeN (TileSizeDimN/WorkLoadPerThreadN) // Local thread size for the second dimension (N here)
#define LoadPerThreadLhs ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)) // workload per thread for Lhs expression
#define LoadPerThreadRhs ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)) // workload per thread for Rhs expression
#define RoundUp(x,y) ((((x) + (y) - 1) / (y))*(y)) // RoundUp: make the global thread count divisible by the local thread count
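With the macro values above, the launch geometry is fully determined by M, N and K; a worked example of the arithmetic:
// For M = N = 300, K = 100:
//   roundUpM = roundUpN = RoundUp(300, 32) = 320, roundUpK = RoundUp(100, 16) = 112
//   global nd-range = (320/WorkLoadPerThreadM, 320/WorkLoadPerThreadN) = (80, 80)
//   local nd-range  = (LocalThreadSizeM, LocalThreadSizeN) = (8, 8)
// i.e. 10 x 10 = 100 work-groups of 64 work-items, each work-item
// accumulating a 4 x 4 block of the output, over 112/16 = 7 K-tiles.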
template <typename PLEXPR, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct KernelNameConstructor;
template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
template< typename Self, typename Output, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
static void Run(const Self& self, Output* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
// create a tuple of accessors from Evaluator
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<typename Self::XprType>::Type PlaceHolderExpr;
typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
// Local memory for elements of Lhs
cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
// Local memory for elements of Rhs
cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
//Output memory
auto out_privateRes= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
// sycl parallel for
cgh.parallel_for<KernelName>( cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), [=](cl::sycl::nd_item<2> itemID) {
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<typename Self::XprType>::Type DevExpr;
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
typename DevEvaluator::RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// Initializing the data mappers must happen inside the kernel for device-side evaluation
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(Output, out_privateRes);
// Matmul Kernel
// Thread identifiers
const int mLocalThreadId = itemID.get_local(0); // Local ID row
const int nLocalThreadId = itemID.get_local(1); // Local ID col
const int mGroupId = itemID.get_group(0); // Work-group ID row
const int nGroupId = itemID.get_group(1); // Work-group ID col
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
float privateLhs;
float privateRhs[WorkLoadPerThreadN];
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
// Initialise the privateRes accumulation registers
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] = 0.0f;
}
}
// Tile Lhs
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
// Load the value (wide vector load)
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<Output>(0);
}
// Tile Rhs
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<Output>(0);
}
// Loop over all tiles
const int numTiles = roundUpK/TileSizeDimK;
int firstHalf=0;
do {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Load the next tile of Lhs and Rhs into local memory
int nextHalf = firstHalf + 1;
if (nextHalf < numTiles) {
// Tile A
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
// global K id
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
// Store the loaded value into local memory
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<Output>(0);
}
// Tile B
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
// Store the loaded value into local memory
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<Output>(0);
}
}
// Loop over the values of a single tile
for (int k=0; k<TileSizeDimK; k++) {
// Cache the values of localRhs in registers
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
}
// Perform the computation
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
}
}
}
// Next tile
firstHalf++;
} while (firstHalf<numTiles);
// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
if (globalRow< M){
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
}
}
}
/// End the kernel
});
});
self.device().synchronize();
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
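A note on the local-memory layout in the kernel above: the two halves of the double buffer are interleaved element-wise rather than stored back to back, so element (row, col) of half h lives at slot (h % 2) + 2*(col*TileSizeDimM + row) for the Lhs tile (and analogously with TileSizeDimN for the Rhs tile). A minimal sketch of that addressing; lhsLocalIndex is a hypothetical helper, not part of the patch:
inline int lhsLocalIndex(int half, int row, int col) {
  // even slots hold the even halves of the tile stream, odd slots the odd halves
  return (half % 2) + 2 * (col * TileSizeDimM + row);
}
The barrier at the top of each do-while iteration makes the previous iteration's stores visible before the tile for firstHalf is consumed, while the prefetch for nextHalf writes only the other half's slots.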

View File

@@ -31,7 +31,7 @@ namespace Eigen {
auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
auto globalid = itemID.get_global_linear_id();
if (globalid < m_rng) {
dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
}
}
@@ -50,7 +50,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device
/// get_devices returns all the available OpenCL devices. Either use device_selector or exclude devices that ComputeCpp does not support (AMD OpenCL for CPU)
auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
if((*it).is_cpu() && s.find("amd")!=std::string::npos){
if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp
it=devices.erase(it);
}
else{
@@ -72,9 +72,9 @@ struct QueueInterface {
mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
/// sycl queue
mutable cl::sycl::queue m_queue;
/// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured throufh dev_Selector typename
/// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
/// SyclStreamDevice is not owned. It is the caller's responsibility to destroy it.
template<typename dev_Selector> explicit QueueInterface(dev_Selector s):
template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
#ifdef EIGEN_EXCEPTIONS
m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
for (const auto& e : l) {
@@ -103,17 +103,21 @@ struct QueueInterface {
auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
buf.set_final_data(nullptr);
std::lock_guard<std::mutex> lock(mutex_);
buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(ptr,buf));
buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
return static_cast<void*>(ptr);
}
/// This is used to deallocate the device pointer. p is used as a key inside
/// the map to find the device buffer and delete it.
EIGEN_STRONG_INLINE void deallocate(const void *p) const {
EIGEN_STRONG_INLINE void deallocate(void *p) const {
std::lock_guard<std::mutex> lock(mutex_);
auto it = buffer_map.find(static_cast<const uint8_t*>(p));
if (it != buffer_map.end()) {
auto num_bytes =it->second.get_size();
buffer_map.erase(it);
// Temporary solution for a memory leak in ComputeCpp; it will be fixed in the next ComputeCpp version
std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1>
a1.deallocate(static_cast<uint8_t*>(p), num_bytes);
}
}
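Taken together, allocate and deallocate implement a pointer-keyed registry: allocate builds a cl::sycl::buffer, hands back its host pointer as the "device" pointer, and records the pair in buffer_map; deallocate looks the pointer up, drops the buffer, and (as the stopgap above) releases the host allocation itself. A minimal round-trip sketch, using only calls that appear in this commit:
auto devices = Eigen::get_sycl_supported_devices();
Eigen::QueueInterface queueInterface(devices[0]);
Eigen::SyclDevice sycl_device(&queueInterface);
float* p = static_cast<float*>(sycl_device.allocate(64 * sizeof(float))); // keyed into buffer_map
sycl_device.deallocate(p); // erased from buffer_map; backing storage freed by the workaround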
@@ -188,7 +192,7 @@ struct SyclDevice {
return m_queue_stream->allocate(num_bytes);
}
/// deallocate device memory
EIGEN_STRONG_INLINE void deallocate(const void *p) const {
EIGEN_STRONG_INLINE void deallocate(void *p) const {
m_queue_stream->deallocate(p);
}
@@ -235,25 +239,25 @@ struct SyclDevice {
size_t rng, GRange, tileSize;
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
// Assuming that dst points to the start of the destination buffer
auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(rng*sizeof(T)));
auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, 0));
});
synchronize();
}
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
/// Implementation of the memset function for SYCL.
template<typename T> EIGEN_STRONG_INLINE void memset(T *buff, int c, size_t n) const {
template<typename T> EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const {
size_t rng, GRange, tileSize;
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(buff))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
auto globalid=itemID.get_global_linear_id();
if (globalid< n) {
if (globalid< rng) {
for(size_t i=0; i<sizeof(T); i++)
buf_acc[globalid*sizeof(T) + i] = c;
}
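Semantically, each work-item clears the sizeof(T) bytes of one element, so the kernel reproduces a byte-wise memset on the device. A host-side reference for what it computes (illustration only, with rng = n/sizeof(T) as set up above):
// for (size_t elem = 0; elem < n / sizeof(T); ++elem)
//   for (size_t i = 0; i < sizeof(T); ++i)
//     static_cast<uint8_t*>(data)[elem * sizeof(T) + i] = static_cast<uint8_t>(c);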

View File

@@ -32,6 +32,7 @@ struct TensorEvaluator
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef Derived XprType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -152,6 +153,8 @@ struct TensorEvaluator<const Derived, Device>
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef const Derived XprType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?

View File

@@ -80,5 +80,7 @@ template<typename T> struct GetType<false, T>{
//sycl functors
#include "TensorSyclFunctors.h"
#include "TensorContractionSycl.h"
#endif // end of EIGEN_USE_SYCL
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H

View File

@@ -135,21 +135,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const)
KERNELBROKERCONVERTERSLICESTRIDEOP()
#undef KERNELBROKERCONVERTERSLICESTRIDEOP
#define KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(OPEXPR, CVQual)\
template<typename Param, typename XprType>\
struct ConvertToDeviceExpression<CVQual OPEXPR <Param, XprType> >{\
typedef CVQual OPEXPR<Param, typename ConvertToDeviceExpression<XprType>::Type> Type;\
};
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorPaddingOp, const)
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorPaddingOp, )
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorReshapingOp, const)
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorReshapingOp, )
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorShufflingOp, const)
KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorShufflingOp, )
#undef KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP
} // namespace internal
} // namespace TensorSycl

View File

@@ -223,26 +223,6 @@ SYCLSLICESTRIDEOPEXTACC()
#undef SYCLSLICESTRIDEOPEXTACC
#define PADDINGRESHAPEANDSHUFFOPEXTRACC(OPEXPR, CVQual)\
template<typename Param, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual OPEXPR<Param, XprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual OPEXPR<Param, XprType>, Dev>& eval)\
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
return AccessorConstructor::getTuple(cgh, eval.impl());\
}\
};
// tensor padding
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorPaddingOp, const)
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorPaddingOp, )
// tensor reshaping
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, const)
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, )
/// Tensor shuffling
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, const)
PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, )
#undef PADDINGRESHAPEANDSHUFFOPEXTRACC
/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)

View File

@@ -58,7 +58,7 @@ SYCLEXTRTENSORMAPFIXEDSIZE()
template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
OP func;\
const OP func;\
FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
: rhsExpr(expr.impl()), func(expr.functor()) {}\
};
@@ -74,7 +74,7 @@ template <template<class, class, class> class BinaryCategory, typename OP, typen
struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
OP func;\
const OP func;\
FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
};
@@ -90,7 +90,7 @@ struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg
FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
OP func;\
const OP func;\
FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
: arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
};
@@ -241,6 +241,23 @@ PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
#undef PADDINGOPFUNCEXT
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorContractionOp. The LHS and RHS here are the original ones; no condition needs to be applied to their type.
#define SYCLEXTRFUNCCONTRACT(CVQual)\
template <typename Indices, typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
const Indices func;\
FunctorExtractor(const TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.indices()) {}\
};
SYCLEXTRFUNCCONTRACT(const)
SYCLEXTRFUNCCONTRACT()
#undef SYCLEXTRFUNCCONTRACT
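For contraction, the extracted functor is thus just the two sub-extractors plus the contraction Indices; on the device they are recombined with the accessor tuple. A compressed view of that pipeline, using the calls exactly as they appear in LaunchSyclKernels::Run earlier in this commit:
// host side: copy the functors/indices and bind the buffers to the command group
auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
// device side: rebuild the expression tree from the two pieces
auto device_expr = Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);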
/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {

View File

@@ -132,19 +132,6 @@ SLICESTRIDEOPLEAFCOUNT(const)
SLICESTRIDEOPLEAFCOUNT()
#undef SLICESTRIDEOPLEAFCOUNT
#define PADDINGRESHAPEANDSHUFFLELEAFCOUNT(OPEXPR, CVQual)\
template<typename Param, typename XprType>\
struct LeafCount<CVQual OPEXPR<Param, XprType> >:CategoryCount<XprType>{};
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorPaddingOp, const)
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorPaddingOp, )
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, const)
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, )
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, const)
PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, )
#undef PADDINGRESHAPEANDSHUFFLELEAFCOUNT
} /// namespace TensorSycl
} /// namespace internal

View File

@@ -191,21 +191,6 @@ SYCLSLICESTRIDEOPPLH(const)
SYCLSLICESTRIDEOPPLH()
#undef SYCLSLICESTRIDEOPPLH
#define PADDINGRESHAPEANDSHUFFLEOPPLH(OPEXP , CVQual)\
template<typename Param, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual OPEXP<Param, XprType>, N > {\
typedef CVQual OPEXP<Param, typename CalculateIndex<N, XprType>::ArgType> Type;\
};
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorPaddingOp, const)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorPaddingOp,)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, const)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, )
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorShufflingOp, const)
PADDINGRESHAPEANDSHUFFLEOPPLH(TensorShufflingOp,)
#undef PADDINGRESHAPEANDSHUFFLEOPPLH
/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>

View File

@@ -56,7 +56,6 @@ void run(Expr &expr, Dev &dev) {
});
dev.synchronize();
}
evaluator.cleanup();
}
} // namespace TensorSycl

View File

@@ -150,6 +150,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
# older compiler that don't support cxx11.

View File

@@ -154,7 +154,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
Tensor<SCALAR, 3> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
out = out.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
@@ -169,8 +168,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
(in_2.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
(out.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
@@ -192,7 +189,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
Tensor<SCALAR, 3> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
out = out.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
@@ -207,8 +203,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
(in_2.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
(out.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
@@ -235,8 +229,6 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
(out.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \

View File

@@ -0,0 +1,218 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include <iostream>
#include <chrono>
#include <ctime>
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout, typename Device>
void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size)
{
// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
// Eigen::array<DimPair, 1> dims(DimPair(1, 0));
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
Eigen::array<int, 2> left_dims = {{m_size, k_size}};
Eigen::array<int, 2> right_dims = {{k_size, n_size}};
Eigen::array<int, 2> result_dims = {{m_size, n_size}};
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
continue;
}
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
sycl_device.deallocate(d_t_left);
sycl_device.deallocate(d_t_right);
sycl_device.deallocate(d_t_result);
}
template<int DataLayout, typename Device>
void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
{
//std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
Tensor<float, 0, DataLayout> t_result;
Tensor<float, 0, DataLayout> t_result_gpu;
Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
Eigen::array<int, 2> left_dims = {{m_size, k_size}};
Eigen::array<int, 2> right_dims = {{k_size, n_size}};
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = sizeof(float);
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result()
<< " vs " << t_result_gpu() << std::endl;
assert(false);
}
sycl_device.deallocate(d_t_left);
sycl_device.deallocate(d_t_right);
sycl_device.deallocate(d_t_result);
}
template<int DataLayout, typename Device>
void test_sycl_contraction_m(const Device& sycl_device) {
for (int k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128);
}
}
template<int DataLayout, typename Device>
void test_sycl_contraction_k(const Device& sycl_device) {
for (int k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128);
}
}
template<int DataLayout, typename Device>
void test_sycl_contraction_n(const Device& sycl_device) {
for (int k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k);
}
}
template<int DataLayout, typename Device>
void test_sycl_contraction_sizes(const Device& sycl_device) {
int m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
int n_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257, 511,
512, 513, 1023, 1024, 1025};
int k_sizes[] = { 31, 39, 63, 64, 65,
95, 96, 127, 129, 255,
257, 511, 512, 513, 1023,
1024, 1025};
for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
}
template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32);
test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32);
test_scalar<ColMajor>(sycl_device, 32, 32, 32);
test_scalar<RowMajor>(sycl_device, 32, 32, 32);
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now();
test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128);
test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128);
test_scalar<ColMajor>(sycl_device, 128, 128, 128);
test_scalar<RowMajor>(sycl_device, 128, 128, 128);
test_sycl_contraction_m<ColMajor>(sycl_device);
test_sycl_contraction_m<RowMajor>(sycl_device);
test_sycl_contraction_n<ColMajor>(sycl_device);
test_sycl_contraction_n<RowMajor>(sycl_device);
test_sycl_contraction_k<ColMajor>(sycl_device);
test_sycl_contraction_k<RowMajor>(sycl_device);
test_sycl_contraction_sizes<ColMajor>(sycl_device);
test_sycl_contraction_sizes<RowMajor>(sycl_device);
end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
std::cout << "finished computation at " << std::ctime(&end_time)
<< "elapsed time: " << elapsed_seconds.count() << "s\n";
}
void test_cxx11_tensor_contract_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorContractionPerDevice(device));
}
}