Converting all parallel_for lambdas to functors in order to prevent duplicate kernel name errors; adding TensorConcatenationOp backend for SYCL.
This commit is contained in:
parent 7949849ebc
commit 35bae513a0
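Note on the lambda-to-functor change: launching a SYCL kernel from a lambda requires an explicit kernel-name template parameter (cgh.parallel_for<KernelName>(...), as in the code being removed below), and, per the commit message, generating those per-expression names could collide and trigger duplicate kernel name errors. When a named functor is passed instead, its type serves as the kernel name and no name template argument is needed. A minimal sketch of the pattern, independent of Eigen (FillFunctor, the sizes, and the fill value are illustrative assumptions, not code from this commit):

    #include <CL/sycl.hpp>

    // Illustrative functor: its type doubles as the SYCL kernel name, so the
    // parallel_for call below needs no <KernelName> template argument.
    struct FillFunctor {
      cl::sycl::accessor<int, 1, cl::sycl::access::mode::discard_write,
                         cl::sycl::access::target::global_buffer> acc;
      int value;
      FillFunctor(decltype(acc) acc_, int value_) : acc(acc_), value(value_) {}
      void operator()(cl::sycl::nd_item<1> item) {
        size_t gid = item.get_global_linear_id();
        if (gid < acc.get_count()) acc[gid] = value;   // guarded fill of the buffer
      }
    };

    int main() {
      cl::sycl::queue q;
      cl::sycl::buffer<int, 1> buf{cl::sycl::range<1>(64)};
      q.submit([&](cl::sycl::handler& cgh) {
        auto acc = buf.get_access<cl::sycl::access::mode::discard_write>(cgh);
        // No kernel-name template argument: the FillFunctor type names the kernel.
        cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(64),
                                               cl::sycl::range<1>(16)),
                         FillFunctor(acc, 0));
      });
      q.wait();
    }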
@@ -276,6 +276,12 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const Axis& axis() const { return m_axis; }
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
|
@@ -190,7 +190,159 @@ LeftEvaluator m_leftImpl;
|
||||
RightEvaluator m_rightImpl;
|
||||
};
|
||||
|
||||
template <typename PLEXPR, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct KernelNameConstructor;
|
||||
|
||||
template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
|
||||
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
|
||||
int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
|
||||
int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{
|
||||
|
||||
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
|
||||
FunctorExpr functors;
|
||||
LhsLocalAcc localLhs;
|
||||
RhsLocalAcc localRhs;
|
||||
OutAccessor out_res;
|
||||
Index roundUpK, M, N, K;
|
||||
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
|
||||
LeftNocontractT m_i_strides, m_left_nocontract_strides;
|
||||
RightNocontractT m_j_strides, m_right_nocontract_strides;
|
||||
TupleType tuple_of_accessors;
|
||||
|
||||
KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
|
||||
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
|
||||
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
|
||||
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_)
|
||||
:functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
|
||||
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
|
||||
m_right_contracting_strides(m_right_contracting_strides_),
|
||||
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
|
||||
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
|
||||
tuple_of_accessors(tuple_of_accessors_){}
|
||||
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
|
||||
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
|
||||
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
|
||||
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
|
||||
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
|
||||
ContractT, 1,
|
||||
lhs_inner_dim_contiguous,
|
||||
false, Unaligned, MakeGlobalPointer> LhsMapper;
|
||||
|
||||
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
|
||||
typename DevEvaluator::RightEvaluator, RightNocontractT,
|
||||
ContractT, 1,
|
||||
rhs_inner_dim_contiguous,
|
||||
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
|
||||
// Initializing the data mappers must happen inside the kernel for device evaluation
|
||||
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
|
||||
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
|
||||
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
|
||||
// Matmul Kernel
|
||||
// Thread identifiers
|
||||
const int mLocalThreadId = itemID.get_local(0); // Local ID row
|
||||
const int nLocalThreadId = itemID.get_local(1); // Local ID col
|
||||
const int mGroupId = itemID.get_group(0); // Work-group ID row
|
||||
const int nGroupId = itemID.get_group(1); // Work-group ID localCol
|
||||
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
|
||||
// Allocate register space
|
||||
float privateLhs;
|
||||
float privateRhs[WorkLoadPerThreadN];
|
||||
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
|
||||
// Initialise the privateRes accumulation registers
|
||||
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
privateRes[wLPTM][wLPTN] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Tile Lhs
|
||||
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
|
||||
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localLhsRow = localLhsLinearId% TileSizeDimM;
|
||||
int localLhsCol = localLhsLinearId/TileSizeDimM;
|
||||
// Load the value (wide vector load)
|
||||
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
|
||||
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
|
||||
}
|
||||
// Tile Rhs
|
||||
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
|
||||
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localRhsRow = localRhsLinearId% TileSizeDimN;
|
||||
int localRhsCol = localRhsLinearId/TileSizeDimN;
|
||||
// Load the value (wide vector load)
|
||||
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
|
||||
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
|
||||
|
||||
}
|
||||
// Loop over all tiles
|
||||
const int numTiles = roundUpK/TileSizeDimK;
|
||||
int firstHalf=0;
|
||||
do {
|
||||
// Synchronise
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
// Load the next tile of Lhs and Rhs into local memory
|
||||
int nextHalf = firstHalf + 1;
|
||||
if (nextHalf < numTiles) {
|
||||
// Tile A
|
||||
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
|
||||
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localLhsRow = localLhsLinearId% TileSizeDimM;
|
||||
int localLhsCol = localLhsLinearId/TileSizeDimM;
|
||||
// global K id
|
||||
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
|
||||
// Store the loaded value into local memory
|
||||
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
|
||||
}
|
||||
// Tile B
|
||||
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
|
||||
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localRhsRow = localRhsLinearId% TileSizeDimN;
|
||||
int localRhsCol = localRhsLinearId/TileSizeDimN;
|
||||
// Load the value (wide vector load)
|
||||
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
|
||||
// Store the loaded vector into local memory
|
||||
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
|
||||
}
|
||||
}
|
||||
// Loop over the values of a single tile
|
||||
for (int k=0; k<TileSizeDimK; k++) {
|
||||
// Cache the values of localRhs in registers
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
|
||||
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
|
||||
}
|
||||
// Perform the computation
|
||||
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
|
||||
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
|
||||
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Next tile
|
||||
firstHalf++;
|
||||
} while (firstHalf<numTiles);
|
||||
|
||||
|
||||
// Store the final results in C
|
||||
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
|
||||
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
|
||||
if (globalRow< M){
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
|
||||
if(globalCol<N)
|
||||
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
|
||||
|
||||
static const int TileSizeDimM = 32; // Tile size for dimension M
|
||||
@@ -208,149 +360,39 @@ static int RoundUp(int x, int y) {
|
||||
return ((((x) + (y) - 1) / (y))*(y));
|
||||
}
|
||||
|
||||
template< typename Self, typename Output, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
|
||||
static void Run(const Self& self, Output* buffer, Index M, Index N, Index K,
|
||||
template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
|
||||
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
|
||||
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
|
||||
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
|
||||
// create a tuple of accessors from Evaluator
|
||||
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<typename Self::XprType>::Type PlaceHolderExpr;
|
||||
typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
|
||||
typedef typename Self::XprType HostExpr;
|
||||
// typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
// typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
|
||||
auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
|
||||
typedef decltype(functors) FunctorExpr;
|
||||
Index roundUpK = RoundUp(K, TileSizeDimK);
|
||||
Index roundUpM = RoundUp(M, TileSizeDimM);
|
||||
Index roundUpN = RoundUp(N, TileSizeDimN);
|
||||
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
|
||||
typedef decltype(tuple_of_accessors) TupleType;
|
||||
// Local memory for elements of Lhs
|
||||
cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
|
||||
typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
|
||||
LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
|
||||
// Local memory for elements of Rhs
|
||||
cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
|
||||
//Output memory
|
||||
auto out_privateRes= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
|
||||
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
|
||||
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
|
||||
//OutScalar memory
|
||||
auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
|
||||
typedef decltype(out_res) OutAccessor;
|
||||
// sycl parallel for
|
||||
cgh.parallel_for<KernelName>( cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), [=](cl::sycl::nd_item<2> itemID) {
|
||||
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<typename Self::XprType>::Type DevExpr;
|
||||
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
|
||||
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
|
||||
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
|
||||
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
|
||||
ContractT, 1,
|
||||
lhs_inner_dim_contiguous,
|
||||
false, Unaligned, MakeGlobalPointer> LhsMapper;
|
||||
|
||||
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
|
||||
typename DevEvaluator::RightEvaluator, RightNocontractT,
|
||||
ContractT, 1,
|
||||
rhs_inner_dim_contiguous,
|
||||
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
|
||||
// Initializing the data mappers must happen inside the kernel for device evaluation
|
||||
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
|
||||
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
|
||||
auto out_ptr = ConvertToActualTypeSycl(Output, out_privateRes);
|
||||
// Matmul Kernel
|
||||
// Thread identifiers
|
||||
const int mLocalThreadId = itemID.get_local(0); // Local ID row
|
||||
const int nLocalThreadId = itemID.get_local(1); // Local ID col
|
||||
const int mGroupId = itemID.get_group(0); // Work-group ID row
|
||||
const int nGroupId = itemID.get_group(1); // Work-group ID localCol
|
||||
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
|
||||
// Allocate register space
|
||||
float privateLhs;
|
||||
float privateRhs[WorkLoadPerThreadN];
|
||||
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
|
||||
// Initialise the privateRes accumulation registers
|
||||
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
privateRes[wLPTM][wLPTN] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Tile Lhs
|
||||
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
|
||||
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localLhsRow = localLhsLinearId% TileSizeDimM;
|
||||
int localLhsCol = localLhsLinearId/TileSizeDimM;
|
||||
// Load the value (wide vector load)
|
||||
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
|
||||
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<Output>(0);
|
||||
}
|
||||
// Tile Rhs
|
||||
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
|
||||
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localRhsRow = localRhsLinearId% TileSizeDimN;
|
||||
int localRhsCol = localRhsLinearId/TileSizeDimN;
|
||||
// Load the value (wide vector load)
|
||||
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
|
||||
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<Output>(0);
|
||||
|
||||
}
|
||||
// Loop over all tiles
|
||||
const int numTiles = roundUpK/TileSizeDimK;
|
||||
int firstHalf=0;
|
||||
do {
|
||||
// Synchronise
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
// Load the next tile of Lhs and Rhs into local memory
|
||||
int nextHalf = firstHalf + 1;
|
||||
if (nextHalf < numTiles) {
|
||||
// Tile A
|
||||
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
|
||||
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localLhsRow = localLhsLinearId% TileSizeDimM;
|
||||
int localLhsCol = localLhsLinearId/TileSizeDimM;
|
||||
// global K id
|
||||
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
|
||||
// Store the loaded value into local memory
|
||||
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<Output>(0);
|
||||
}
|
||||
// Tile B
|
||||
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
|
||||
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
|
||||
int localRhsRow = localRhsLinearId% TileSizeDimN;
|
||||
int localRhsCol = localRhsLinearId/TileSizeDimN;
|
||||
// Load the value (wide vector load)
|
||||
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
|
||||
// Store the loaded vector into local memory
|
||||
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<Output>(0);
|
||||
}
|
||||
}
|
||||
// Loop over the values of a single tile
|
||||
for (int k=0; k<TileSizeDimK; k++) {
|
||||
// Cache the values of localRhs in registers
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
|
||||
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
|
||||
}
|
||||
// Perform the computation
|
||||
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
|
||||
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
|
||||
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Next tile
|
||||
firstHalf++;
|
||||
} while (firstHalf<numTiles);
|
||||
|
||||
|
||||
// Store the final results in C
|
||||
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
|
||||
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
|
||||
if (globalRow< M){
|
||||
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
|
||||
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
|
||||
if(globalCol<N)
|
||||
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// End the kernel
|
||||
});
|
||||
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
|
||||
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
|
||||
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
|
||||
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
|
||||
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors,
|
||||
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
|
||||
m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors));
|
||||
});
|
||||
self.device().asynchronousExec();
|
||||
}
|
||||
|
@@ -43,6 +43,18 @@ namespace Eigen {
|
||||
size_t m_offset;
|
||||
};
|
||||
|
||||
struct memsetkernelFunctor{
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
|
||||
AccType m_acc;
|
||||
const size_t m_rng, m_c;
|
||||
memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
if (globalid< m_rng) m_acc[globalid] = m_c;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
|
||||
auto devices = cl::sycl::device::get_devices();
|
||||
std::vector<cl::sycl::device>::iterator it =devices.begin();
|
||||
@@ -88,15 +100,17 @@ struct QueueInterface {
|
||||
}
|
||||
}
|
||||
}))
|
||||
#else
|
||||
m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
|
||||
for (const auto& e : l) {
|
||||
if (e) {
|
||||
exception_caught_ = true;
|
||||
}
|
||||
}
|
||||
}))
|
||||
#endif
|
||||
#else
|
||||
m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
|
||||
for (const auto& e : l) {
|
||||
if (e) {
|
||||
exception_caught_ = true;
|
||||
std::cerr << "Error detected Inside Sycl Device."<< std::endl;
|
||||
|
||||
}
|
||||
}
|
||||
}))
|
||||
#endif
|
||||
{}
|
||||
|
||||
/// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
|
||||
@@ -256,22 +270,26 @@ struct SyclDevice {
|
||||
/// returning the sycl queue
|
||||
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
|
||||
/// Here is the implementation of the memset function for SYCL.
|
||||
template<typename T> EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const {
|
||||
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
|
||||
sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
if (globalid< rng) {
|
||||
for(size_t i=0; i<sizeof(T); i++)
|
||||
buf_acc[globalid*sizeof(T) + i] = c;
|
||||
}
|
||||
});
|
||||
});
|
||||
parallel_for_setup(n, tileSize, rng, GRange);
|
||||
sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
|
||||
asynchronousExec();
|
||||
}
|
||||
|
||||
struct memsetCghFunctor{
|
||||
cl::sycl::buffer<uint8_t, 1>& m_buf;
|
||||
const size_t& rng , GRange, tileSize;
|
||||
const int &c;
|
||||
memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
|
||||
:m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
|
||||
|
||||
void operator()(cl::sycl::handler &cgh) const {
|
||||
auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
|
||||
}
|
||||
};
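To make the rewritten memset path concrete, a small hedged usage sketch (the buffer size and fill value are arbitrary; it relies only on the allocate/memset/deallocate calls shown in this file):

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    // Zero-fill a device allocation; memset() submits memsetCghFunctor, which in turn
    // launches memsetkernelFunctor, so no kernel-name template argument is involved.
    void zero_fill(const Eigen::SyclDevice& sycl_device) {
      const size_t n = 1024 * sizeof(float);                            // arbitrary size for illustration
      float* device_ptr = static_cast<float*>(sycl_device.allocate(n));
      sycl_device.memset(device_ptr, 0, n);
      sycl_device.deallocate(device_ptr);
    }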
|
||||
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
// FIXME
|
||||
return 48*1024;
|
||||
|
@@ -41,6 +41,9 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
|
||||
// Intermediate typedef to workaround MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
typedef typename MakePointerT::RefType RefType;
|
||||
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@@ -69,7 +69,9 @@ struct TensorEvaluator
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
|
||||
coeffRef(Index index) {
|
||||
eigen_assert(m_data);
|
||||
return m_data[index];
|
||||
}
|
||||
@@ -95,7 +97,9 @@ struct TensorEvaluator
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
|
||||
coeffRef(const array<DenseIndex, NumCoords>& coords) {
|
||||
eigen_assert(m_data);
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return m_data[m_dims.IndexOfColMajor(coords)];
|
||||
|
@@ -46,6 +46,8 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
|
||||
// Intermediate typedef to workaround MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
typedef typename MakePointerT::RefType RefType;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
@@ -107,7 +109,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
/// op_ is used for sycl
|
||||
/// op_ is used for sycl
|
||||
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
|
||||
{ }
|
||||
|
||||
|
@@ -20,16 +20,20 @@ namespace Eigen {
|
||||
// map_allocator.
|
||||
template<typename T> struct MakePointer {
|
||||
typedef T* Type;
|
||||
typedef T& RefType;
|
||||
};
|
||||
#if defined(EIGEN_USE_SYCL)
|
||||
namespace TensorSycl {
|
||||
namespace internal{
|
||||
template <typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
|
||||
template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
|
||||
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
|
||||
struct FullReductionKernelFunctor;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
|
||||
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
|
||||
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
|
||||
|
@@ -693,10 +693,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_USE_SYCL)
|
||||
template < typename HostExpr_, typename PlaceHolderExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
|
||||
|
||||
template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
|
||||
template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
|
||||
#endif
|
||||
|
||||
|
||||
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
|
||||
|
||||
// Returns the Index in the input tensor of the first value that needs to be
|
||||
|
@@ -25,8 +25,7 @@
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
|
||||
template<typename CoeffReturnType> struct syclGenericBufferReducer{
|
||||
template<typename BufferTOut, typename BufferTIn>
|
||||
static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||
do {
|
||||
@@ -35,50 +34,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
|
||||
cl::sycl::range<1>{std::min(length, local)}};
|
||||
/* Two accessors are used: one to the buffer that is being reduced,
|
||||
* and a second to local memory, used to store intermediate data. */
|
||||
auto aI =
|
||||
bufI.template get_access<cl::sycl::access::mode::read_write>(h);
|
||||
auto aOut =
|
||||
bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
|
||||
cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
|
||||
cl::sycl::access::target::local>
|
||||
scratch(cl::sycl::range<1>(local), h);
|
||||
auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
|
||||
auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
|
||||
typedef decltype(aI) InputAccessor;
|
||||
typedef decltype(aOut) OutputAccessor;
|
||||
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
|
||||
LocalAccessor scratch(cl::sycl::range<1>(local), h);
|
||||
|
||||
/* The parallel_for invocation chosen is the variant with an nd_item
|
||||
* parameter, since the code requires barriers for correctness. */
|
||||
h.parallel_for<KernelName>(
|
||||
r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
|
||||
size_t globalid = id.get_global(0);
|
||||
size_t localid = id.get_local(0);
|
||||
/* All threads collectively read from global memory into local.
|
||||
* The barrier ensures all threads' IO is resolved before
|
||||
* execution continues (strictly speaking, all threads within
|
||||
* a single work-group - there is no co-ordination between
|
||||
* work-groups, only work-items). */
|
||||
if (globalid < length) {
|
||||
scratch[localid] = aI[globalid];
|
||||
}
|
||||
id.barrier(cl::sycl::access::fence_space::local_space);
|
||||
|
||||
/* Apply the reduction operation between the current local
|
||||
* id and the one on the other half of the vector. */
|
||||
if (globalid < length) {
|
||||
auto min = (length < local) ? length : local;
|
||||
for (size_t offset = min / 2; offset > 0; offset /= 2) {
|
||||
if (localid < offset) {
|
||||
scratch[localid] += scratch[localid + offset];
|
||||
}
|
||||
id.barrier(cl::sycl::access::fence_space::local_space);
|
||||
}
|
||||
/* The final result will be stored in local id 0. */
|
||||
if (localid == 0) {
|
||||
aI[id.get_group(0)] = scratch[localid];
|
||||
if((length<=local) && globalid ==0){
|
||||
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
|
||||
aOutPtr[0]=scratch[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
|
||||
};
|
||||
dev.sycl_queue().submit(f);
|
||||
dev.asynchronousExec();
|
||||
@@ -96,11 +61,11 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
|
||||
|
||||
};
|
||||
|
||||
/// For now let's start with a full reducer
|
||||
/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
|
||||
/// we want to take the reduction child and then build a construction and apply the full reducer function on it. FullReducer applies the
|
||||
/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
|
||||
// a leafNode.
|
||||
|
||||
template <typename Self, typename Op, bool Vectorizable>
|
||||
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
|
||||
@@ -109,8 +74,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
|
||||
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
|
||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
auto functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
typedef decltype(functors) FunctorExpr;
|
||||
int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
|
||||
size_t inputSize =self.impl().dimensions().TotalSize();
|
||||
size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
|
||||
@@ -135,48 +100,29 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
size_t outTileSize = tileSize;
|
||||
/// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
|
||||
if (GRange < outTileSize) outTileSize=GRange;
|
||||
// getting final out buffer at the moment the created buffer is true because there is no need for assign
|
||||
auto out_buffer =dev.get_sycl_buffer(output);
|
||||
|
||||
/// creating the shared memory for calculating reduction.
|
||||
/// This one is used to collect all the reduced values from shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
|
||||
/// recursively apply reduction on it in order to reduce the whole.
|
||||
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
|
||||
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
|
||||
Dims dims= self.xprDims();
|
||||
Op functor = reducer;
|
||||
// Dims dims= self.xprDims();
|
||||
//Op functor = reducer;
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
typedef decltype(tuple_of_accessors) TupleType;
|
||||
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
|
||||
cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
/// Reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviours:
|
||||
/// the first is when it is used as a root to launch the sub-kernel. The second is when it is treated as a leaf node to pass the
|
||||
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
|
||||
const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
|
||||
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
|
||||
/// the device_evaluator is detectable and recognisable on the device.
|
||||
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
|
||||
if(globalid<rng)
|
||||
tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(functor));
|
||||
else
|
||||
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
|
||||
|
||||
if(remaining!=0 && globalid==0 )
|
||||
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
|
||||
tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
|
||||
});
|
||||
typedef decltype(tmp_global_accessor) OutAccessor;
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
|
||||
TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
|
||||
(tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
|
||||
});
|
||||
dev.asynchronousExec();
|
||||
|
||||
/// This is used to recursively reduce the tmp value to an element of 1;
|
||||
syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||
// getting final out buffer at the moment the created buffer is true because there is no need for assign
|
||||
auto out_buffer =dev.get_sycl_buffer(output);
|
||||
/// This is used to recursively reduce the tmp value to an element of 1;
|
||||
syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||
}
|
||||
|
||||
};
|
||||
@@ -190,7 +136,6 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
||||
|
||||
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
|
||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
auto functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
typedef decltype(functors) FunctorExpr;
|
||||
typename Self::Index range, GRange, tileSize;
|
||||
@@ -208,7 +153,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
||||
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
|
||||
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
|
||||
TensorSycl::internal::ReductionFunctor<HostExpr, PlaceHolderExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
|
||||
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
|
||||
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
|
||||
|
||||
});
|
||||
|
@@ -20,12 +20,14 @@
|
||||
template <class T>
|
||||
struct MakeGlobalPointer {
|
||||
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
|
||||
typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
|
||||
};
|
||||
|
||||
// global pointer to set different attribute state for a class
|
||||
template <class T>
|
||||
struct MakeLocalPointer {
|
||||
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
|
||||
typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
|
||||
};
|
||||
|
||||
|
||||
@@ -33,6 +35,9 @@ namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
|
||||
template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
|
||||
|
||||
|
||||
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
|
||||
struct NoOP;
|
||||
|
||||
|
@@ -241,21 +241,25 @@ PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
|
||||
PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
|
||||
#undef PADDINGOPFUNCEXT
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// TensorContractionOp The LHS and RHS here are the original one no need to apply condition on their type.
|
||||
#define SYCLEXTRFUNCCONTRACT(CVQual)\
|
||||
template <typename Indices, typename LHSExpr, typename RHSExpr, typename Dev>\
|
||||
struct FunctorExtractor<TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev> > {\
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp
|
||||
/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type.
|
||||
#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
|
||||
template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
|
||||
struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
|
||||
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
|
||||
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
|
||||
const Indices func;\
|
||||
FunctorExtractor(const TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev>& expr)\
|
||||
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.indices()) {}\
|
||||
const Param func;\
|
||||
FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
|
||||
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
|
||||
};
|
||||
|
||||
SYCLEXTRFUNCCONTRACT(const)
|
||||
SYCLEXTRFUNCCONTRACT()
|
||||
#undef SYCLEXTRFUNCCONTRACT
|
||||
// TensorContractionOp
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const)
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),)
|
||||
// TensorConcatenationOp
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
|
||||
#undef SYCLEXTRFUNCCONTRACTCONCAT
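For clarity, a hedged illustration (not itself part of the patch) of what SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) expands to, obtained by substituting into the macro above:

    // Expansion sketch: FunctorExtractor specialisation for a const TensorConcatenationOp evaluator.
    template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>
    struct FunctorExtractor<TensorEvaluator<const TensorConcatenationOp<Param, LHSExpr, RHSExpr>, Dev> > {
      FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
      FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
      const Param func;  // here Param is the concatenation Axis, captured through expr.axis()
      FunctorExtractor(const TensorEvaluator<const TensorConcatenationOp<Param, LHSExpr, RHSExpr>, Dev>& expr)
        : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()), func(expr.axis()) {}
    };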
|
||||
|
||||
|
||||
/// template deduction function for FunctorExtractor
|
||||
|
@@ -18,9 +18,53 @@ namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
|
||||
template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
|
||||
OutputAccessor aOut;
|
||||
InputAccessor aI;
|
||||
LocalAccessor scratch;
|
||||
size_t length, local;
|
||||
GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
|
||||
: aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
size_t globalid = itemID.get_global(0);
|
||||
size_t localid = itemID.get_local(0);
|
||||
/* All threads collectively read from global memory into local.
|
||||
* The barrier ensures all threads' IO is resolved before
|
||||
* execution continues (strictly speaking, all threads within
|
||||
* a single work-group - there is no co-ordination between
|
||||
* work-groups, only work-items). */
|
||||
if (globalid < length) {
|
||||
scratch[localid] = aI[globalid];
|
||||
}
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
|
||||
/* Apply the reduction operation between the current local
|
||||
* id and the one on the other half of the vector. */
|
||||
if (globalid < length) {
|
||||
auto min = (length < local) ? length : local;
|
||||
for (size_t offset = min / 2; offset > 0; offset /= 2) {
|
||||
if (localid < offset) {
|
||||
scratch[localid] += scratch[localid + offset];
|
||||
}
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
}
|
||||
/* The final result will be stored in local id 0. */
|
||||
if (localid == 0) {
|
||||
aI[itemID.get_group(0)] = scratch[localid];
|
||||
if((length<=local) && globalid ==0){
|
||||
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
|
||||
aOutPtr[0]=scratch[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/// ReductionFunctor
|
||||
template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
|
||||
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
|
||||
public:
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
|
||||
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
|
||||
@@ -56,6 +100,46 @@ template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, ty
|
||||
};
|
||||
|
||||
|
||||
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
|
||||
struct FullReductionKernelFunctor{
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
OutAccessor tmp_global_accessor;
|
||||
Index rng , remaining, red_factor;
|
||||
Op op;
|
||||
Dims dims;
|
||||
FunctorExpr functors;
|
||||
TupleType tuple_of_accessors;
|
||||
|
||||
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
|
||||
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
|
||||
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
/// Reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviours:
|
||||
/// the first is when it is used as a root to launch the sub-kernel. The second is when it is treated as a leaf node to pass the
|
||||
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
|
||||
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
|
||||
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
|
||||
/// the device_evaluator is detectable and recognisable on the device.
|
||||
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
|
||||
if(globalid<rng)
|
||||
tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
|
||||
else
|
||||
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
|
||||
|
||||
if(remaining!=0 && globalid==0 )
|
||||
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
|
||||
tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -25,6 +25,31 @@
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
|
||||
|
||||
template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
|
||||
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
|
||||
|
||||
typedef typename Expr::Index Index;
|
||||
Index range;
|
||||
FunctorExpr functors;
|
||||
TupleType tuple_of_accessors;
|
||||
ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
: range(range_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
|
||||
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
|
||||
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
|
||||
if (gId < range)
|
||||
device_evaluator.evalScalar(gId);
|
||||
}
|
||||
};
|
||||
/// The run function in TensorSycl converts the expression tree to a buffer-
|
||||
/// based expression tree;
|
||||
/// creates the expression tree for the device with accessor to buffers;
|
||||
@@ -34,25 +59,19 @@ void run(Expr &expr, Dev &dev) {
|
||||
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
|
||||
auto functors = internal::extractFunctors(evaluator);
|
||||
|
||||
typedef decltype(functors) FunctorExpr;
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
|
||||
typedef decltype(tuple_of_accessors) TupleType;
|
||||
typename Expr::Index range, GRange, tileSize;
|
||||
dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange);
|
||||
|
||||
// run the kernel
|
||||
cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
|
||||
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
|
||||
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
|
||||
if (gId < range) {
|
||||
device_evaluator.evalScalar(gId);
|
||||
}
|
||||
});
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
|
||||
ExecExprFunctorKernel<Expr, FunctorExpr, TupleType>(range, functors, tuple_of_accessors));
|
||||
});
|
||||
dev.asynchronousExec();
|
||||
}
|
||||
|
@@ -58,6 +58,8 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
|
||||
};
|
||||
template <typename T> struct MakePointer {
|
||||
typedef T* Type;
|
||||
typedef T& RefType;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
@@ -76,6 +78,8 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
|
||||
};
|
||||
template <typename T> struct MakePointer {
|
||||
typedef T* Type;
|
||||
typedef T& RefType;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
@@ -98,6 +102,8 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
|
||||
// Intermediate typedef to workaround MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
typedef typename MakePointerT::RefType RefType;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@@ -151,6 +151,7 @@ if(EIGEN_TEST_CXX11)
|
||||
ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
|
||||
ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
|
||||
ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
|
||||
ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
|
||||
endif(EIGEN_TEST_SYCL)
|
||||
# It should be safe to always run these tests as there is some fallback code for
|
||||
# older compiler that don't support cxx11.
|
||||
|
unsupported/test/cxx11_tensor_concatenation_sycl.cpp (new file, 180 lines added)
@@ -0,0 +1,180 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_SYCL
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template<typename DataType, int DataLayout, typename Index>
|
||||
static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
Index leftDim1 = 2;
|
||||
Index leftDim2 = 3;
|
||||
Index leftDim3 = 1;
|
||||
Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
|
||||
Index rightDim1 = 2;
|
||||
Index rightDim2 = 3;
|
||||
Index rightDim3 = 1;
|
||||
Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
|
||||
|
||||
//Index concatDim1 = 3;
|
||||
// Index concatDim2 = 3;
|
||||
// Index concatDim3 = 1;
|
||||
//Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, Index> left(leftRange);
|
||||
Tensor<DataType, 3, DataLayout, Index> right(rightRange);
|
||||
left.setRandom();
|
||||
right.setRandom();
|
||||
|
||||
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
|
||||
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
|
||||
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
|
||||
///
|
||||
Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
|
||||
DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
|
||||
|
||||
//concatenation = left.concatenate(right, 0);
|
||||
gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
|
||||
sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
|
||||
VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
|
||||
VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
|
||||
}
|
||||
for (int i = 2; i < 4; ++i) {
|
||||
VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
|
||||
}
|
||||
}
|
||||
|
||||
sycl_device.deallocate(gpu_out_data1);
|
||||
Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
|
||||
DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
|
||||
gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
|
||||
sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
//concatenation = left.concatenate(right, 1);
|
||||
VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
|
||||
VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
|
||||
VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
|
||||
}
|
||||
for (int j = 3; j < 6; ++j) {
|
||||
VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_out_data2);
|
||||
Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
|
||||
DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
|
||||
gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
|
||||
sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
//concatenation = left.concatenate(right, 2);
|
||||
VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
|
||||
VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
|
||||
VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
|
||||
VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_out_data3);
|
||||
sycl_device.deallocate(gpu_in1_data);
|
||||
sycl_device.deallocate(gpu_in2_data);
|
||||
}
|
||||
template<typename DataType, int DataLayout, typename Index>
|
||||
static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
|
||||
Index leftDim1 = 2;
|
||||
Index leftDim2 = 3;
|
||||
Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}};
|
||||
|
||||
Index rightDim1 = 2;
|
||||
Index rightDim2 = 3;
|
||||
Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}};
|
||||
|
||||
Index concatDim1 = 4;
|
||||
Index concatDim2 = 3;
|
||||
Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}};
|
||||
|
||||
Tensor<DataType, 2, DataLayout, Index> left(leftRange);
|
||||
Tensor<DataType, 2, DataLayout, Index> right(rightRange);
|
||||
Tensor<DataType, 2, DataLayout, Index> result(resRange);
|
||||
|
||||
left.setRandom();
|
||||
right.setRandom();
|
||||
result.setRandom();
|
||||
|
||||
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
|
||||
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
|
||||
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
|
||||
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
// t1.concatenate(t2, 0) = result;
|
||||
gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out;
|
||||
sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
VERIFY_IS_EQUAL(left(i, j), result(i, j));
|
||||
VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_in1_data);
|
||||
sycl_device.deallocate(gpu_in2_data);
|
||||
sycl_device.deallocate(gpu_out_data);
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
|
||||
QueueInterface queueInterface(s);
|
||||
auto sycl_device = Eigen::SyclDevice(&queueInterface);
|
||||
test_simple_concatenation<DataType, RowMajor, int>(sycl_device);
|
||||
test_simple_concatenation<DataType, ColMajor, int>(sycl_device);
|
||||
test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device);
|
||||
}
|
||||
void test_cxx11_tensor_concatenation_sycl() {
|
||||
for (const auto& device :Eigen::get_sycl_supported_devices()) {
|
||||
CALL_SUBTEST(tensorConcat_perDevice<float>(device));
|
||||
}
|
||||
}
|