Converting all parallel_for lambdas to functors in order to prevent duplicate kernel name errors; adding TensorConcatenationOp backend for SYCL.

Mehdi Goli 2016-12-16 19:46:45 +00:00
parent 7949849ebc
commit 35bae513a0
16 changed files with 588 additions and 264 deletions
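Background on the first change, for context: in SYCL 1.2 every kernel needs a globally unique name type. The old code named each parallel_for after PlaceHolderExpr, so two submissions that resolved to the same placeholder expression collided with a duplicate kernel name error. A named function object can serve as its own kernel name, so moving the kernel body into a functor makes every distinct instantiation a distinct kernel. A minimal sketch of the pattern (not part of this commit; FillFunctor is hypothetical):

#include <CL/sycl.hpp>

// The functor's type doubles as the kernel name, so every distinct
// instantiation automatically gets a distinct name.
template <typename T>
struct FillFunctor {
  cl::sycl::accessor<T, 1, cl::sycl::access::mode::write,
                     cl::sycl::access::target::global_buffer> acc;
  T value;
  void operator()(cl::sycl::id<1> i) { acc[i] = value; }
};

int main() {
  cl::sycl::queue q;
  cl::sycl::buffer<float, 1> buf{cl::sycl::range<1>(64)};
  q.submit([&](cl::sycl::handler& cgh) {
    auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
    // No explicit <KernelName> template argument is needed for a functor.
    cgh.parallel_for(cl::sycl::range<1>(64), FillFunctor<float>{acc, 1.0f});
  });
  q.wait();
}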

View File

@@ -276,6 +276,12 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
/// required by sycl in order to extract the accessor
const Axis& axis() const { return m_axis; }
protected:
Dimensions m_dimensions;

View File

@@ -190,7 +190,159 @@ LeftEvaluator m_leftImpl;
RightEvaluator m_rightImpl;
};
template <typename PLEXPR, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct KernelNameConstructor;
template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
FunctorExpr functors;
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
Index roundUpK, M, N, K;
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
RightNocontractT m_j_strides, m_right_nocontract_strides;
TupleType tuple_of_accessors;
KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_)
:functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
tuple_of_accessors(tuple_of_accessors_){}
void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
typename DevEvaluator::RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// initialize data mappers must happen inside the kernel for device eval
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
// Matmul Kernel
// Thread identifiers
const int mLocalThreadId = itemID.get_local(0); // Local ID row
const int nLocalThreadId = itemID.get_local(1); // Local ID col
const int mGroupId = itemID.get_group(0); // Work-group ID row
const int nGroupId = itemID.get_group(1); // Work-group ID localCol
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
float privateLhs;
float privateRhs[WorkLoadPerThreadN];
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
// Initialise the privateRes accumulation registers
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] = 0.0f;
}
}
// Tile Lhs
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
// Load the value (wide vector load)
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
}
// Tile Rhs
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
}
// Loop over all tiles
const int numTiles = roundUpK/TileSizeDimK;
int firstHalf=0;
do {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Load the next tile of Lhs and Rhs into local memory
int nextHalf = firstHalf + 1;
if (nextHalf < numTiles) {
// Tile A
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
// global K id
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
// Store the loaded value into local memory
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
}
// Tile B
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
// Store the loaded vector into local memory
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
}
}
// Loop over the values of a single tile
for (int k=0; k<TileSizeDimK; k++) {
// Cache the values of localRhs in registers
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
}
// Perform the computation
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
}
}
}
// Next tile
firstHalf++;
} while (firstHalf<numTiles);
// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
if (globalRow< M){
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
}
}
}
}
};
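The (half % 2) terms in the local-memory indices above implement software double buffering: localLhs and localRhs are allocated at twice the tile size (see the 2* factors in the accessor ranges further down), holding two interleaved copies of each tile. A sketch of the addressing as read off the code above (lhsLocalIndex is a hypothetical helper, not from the commit):

// element (row, col) of the Lhs tile selected by `half` lives at:
inline int lhsLocalIndex(int row, int col, int half, int tileSizeDimM) {
  return (half % 2) + 2 * (col * tileSizeDimM + row);
}
// consecutive `half` values alternate between the even and odd slots, so
// prefetching tile firstHalf+1 never overwrites the tile being consumed.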
template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
static const int TileSizeDimM = 32; // Tile size for dimension M
@@ -208,149 +360,39 @@ static int RoundUp(int x, int y) {
return ((((x) + (y) - 1) / (y))*(y));
}
template< typename Self, typename Output, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
static void Run(const Self& self, Output* buffer, Index M, Index N, Index K,
template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
// create a tuple of accessors from Evaluator
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<typename Self::XprType>::Type PlaceHolderExpr;
typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
typedef typename Self::XprType HostExpr;
// typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
// typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
typedef decltype(functors) FunctorExpr;
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
typedef decltype(tuple_of_accessors) TupleType;
// Local memory for elements of Lhs
cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
// Local memory for elements of Rhs
cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
//Output memory
auto out_privateRes= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
//OutScalar memory
auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
typedef decltype(out_res) OutAccessor;
// sycl parallel for
cgh.parallel_for<KernelName>( cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), [=](cl::sycl::nd_item<2> itemID) {
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<typename Self::XprType>::Type DevExpr;
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
typename DevEvaluator::RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// initialize data mappers must happen inside the kernel for device eval
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(Output, out_privateRes);
// Matmul Kernel
// Thread identifiers
const int mLocalThreadId = itemID.get_local(0); // Local ID row
const int nLocalThreadId = itemID.get_local(1); // Local ID col
const int mGroupId = itemID.get_group(0); // Work-group ID row
const int nGroupId = itemID.get_group(1); // Work-group ID localCol
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
float privateLhs;
float privateRhs[WorkLoadPerThreadN];
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
// Initialise the privateRes accumulation registers
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] = 0.0f;
}
}
// Tile Lhs
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
// Load the value (wide vector load)
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<Output>(0);
}
// Tile Rhs
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<Output>(0);
}
// Loop over all tiles
const int numTiles = roundUpK/TileSizeDimK;
int firstHalf=0;
do {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Load the next tile of Lhs and Rhs into local memory
int nextHalf = firstHalf + 1;
if (nextHalf < numTiles) {
// Tile A
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
// global K id
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
// Store the loaded value into local memory
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<Output>(0);
}
// Tile B
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
// Store the loaded vector into local memory
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<Output>(0);
}
}
// Loop over the values of a single tile
for (int k=0; k<TileSizeDimK; k++) {
// Cache the values of localRhs in registers
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
}
// Perform the computation
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
}
}
}
// Next tile
firstHalf++;
} while (firstHalf<numTiles);
// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
if (globalRow< M){
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
}
}
}
/// End the kernel
});
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors,
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors));
});
self.device().asynchronousExec();
}
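For reference, the launch geometry computed above works out as follows; only TileSizeDimM = 32 is visible in this hunk, so the remaining constants in this sketch are hypothetical (chosen so that WorkLoadPerThreadM * LocalThreadSizeM = TileSizeDimM):

// M = 100, TileSizeDimM = 32, WorkLoadPerThreadM = 4, LocalThreadSizeM = 8
//   roundUpM            = RoundUp(100, 32) = 128
//   global M dimension  = 128 / 4          = 32 work-items
//   work-groups along M = 32 / 8           = 4
// Each work-item accumulates a WorkLoadPerThreadM x WorkLoadPerThreadN block
// of outputs, so one work-group covers one 32-row tile of the result.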

View File

@@ -43,6 +43,18 @@ namespace Eigen {
size_t m_offset;
};
struct memsetkernelFunctor{
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
AccType m_acc;
const size_t m_rng, m_c;
memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
void operator()(cl::sycl::nd_item<1> itemID) {
auto globalid=itemID.get_global_linear_id();
if (globalid< m_rng) m_acc[globalid] = m_c;
}
};
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
auto devices = cl::sycl::device::get_devices();
std::vector<cl::sycl::device>::iterator it =devices.begin();
@@ -88,15 +100,17 @@ struct QueueInterface {
}
}
}))
#else
m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
for (const auto& e : l) {
if (e) {
exception_caught_ = true;
}
}
}))
#endif
#else
m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
for (const auto& e : l) {
if (e) {
exception_caught_ = true;
std::cerr << "Error detected Inside Sycl Device."<< std::endl;
}
}
}))
#endif
{}
/// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
@@ -256,22 +270,26 @@ struct SyclDevice {
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
/// Here is the implementation of memset function on sycl.
template<typename T> EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const {
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
size_t rng, GRange, tileSize;
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
auto globalid=itemID.get_global_linear_id();
if (globalid< rng) {
for(size_t i=0; i<sizeof(T); i++)
buf_acc[globalid*sizeof(T) + i] = c;
}
});
});
parallel_for_setup(n, tileSize, rng, GRange);
sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
asynchronousExec();
}
struct memsetCghFunctor{
cl::sycl::buffer<uint8_t, 1>& m_buf;
const size_t& rng , GRange, tileSize;
const int &c;
memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
:m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
void operator()(cl::sycl::handler &cgh) const {
auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
}
};
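Hypothetical usage of the reworked memset (mirroring the device setup used in the tests below): n is now a raw byte count, and the submission goes through memsetCghFunctor, which launches the memsetkernelFunctor defined earlier in this file.

auto devices = Eigen::get_sycl_supported_devices();
Eigen::QueueInterface queueInterface(devices[0]);
Eigen::SyclDevice sycl_device(&queueInterface);
float* ptr = static_cast<float*>(sycl_device.allocate(1024 * sizeof(float)));
sycl_device.memset(ptr, 0, 1024 * sizeof(float)); // byte count; no sizeof(T) scaling by the caller
sycl_device.deallocate(ptr);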
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;

View File

@@ -41,6 +41,9 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
typedef typename MakePointerT::RefType RefType;
};
};

View File

@@ -69,7 +69,9 @@ struct TensorEvaluator
return m_data[index];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
coeffRef(Index index) {
eigen_assert(m_data);
return m_data[index];
}
@@ -95,7 +97,9 @@ struct TensorEvaluator
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
coeffRef(const array<DenseIndex, NumCoords>& coords) {
eigen_assert(m_data);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];

View File

@@ -46,6 +46,8 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
typedef typename MakePointerT::RefType RefType;
};
};
@@ -107,7 +109,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
/// op_ is used for sycl
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }

View File

@@ -20,16 +20,20 @@ namespace Eigen {
// map_allocator.
template<typename T> struct MakePointer {
typedef T* Type;
typedef T& RefType;
};
#if defined(EIGEN_USE_SYCL)
namespace TensorSycl {
namespace internal{
template <typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
struct FullReductionKernelFunctor;
}
}
#endif
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;

View File

@@ -693,10 +693,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#endif
#if defined(EIGEN_USE_SYCL)
template < typename HostExpr_, typename PlaceHolderExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
#endif
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
// Returns the Index in the input tensor of the first value that needs to be

View File

@@ -25,8 +25,7 @@
namespace Eigen {
namespace internal {
template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename CoeffReturnType> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
@@ -35,50 +34,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
auto aI =
bufI.template get_access<cl::sycl::access::mode::read_write>(h);
auto aOut =
bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
scratch(cl::sycl::range<1>(local), h);
auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
typedef decltype(aI) InputAccessor;
typedef decltype(aOut) OutputAccessor;
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(local), h);
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
h.parallel_for<KernelName>(
r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
size_t globalid = id.get_global(0);
size_t localid = id.get_local(0);
/* All threads collectively read from global memory into local.
* The barrier ensures all threads' IO is resolved before
* execution continues (strictly speaking, all threads within
* a single work-group - there is no co-ordination between
* work-groups, only work-items). */
if (globalid < length) {
scratch[localid] = aI[globalid];
}
id.barrier(cl::sycl::access::fence_space::local_space);
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */
if (globalid < length) {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
scratch[localid] += scratch[localid + offset];
}
id.barrier(cl::sycl::access::fence_space::local_space);
}
/* The final result will be stored in local id 0. */
if (localid == 0) {
aI[id.get_group(0)] = scratch[localid];
if((length<=local) && globalid ==0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
aOutPtr[0]=scratch[0];
}
}
}
});
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@@ -96,11 +61,11 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
};
/// For now let's start with a full reducer.
/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
/// We want to take the reduction child, build a construction from it, and apply the full reducer function on it. FullReducer applies the
/// reduction operation on the child of the reduction. Once it is done, the reduction is an empty shell and can be thrown away and treated as
/// a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
@@ -109,8 +74,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
auto functors = TensorSycl::internal::extractFunctors(self.impl());
typedef decltype(functors) FunctorExpr;
int red_factor =256; /// initial reduction factor. If the size is less than red_factor we only create one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
size_t rng = inputSize/red_factor; // the initial number of threads is the input size divided by red_factor
@@ -135,48 +100,29 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
size_t outTileSize = tileSize;
/// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
if (GRange < outTileSize) outTileSize=GRange;
// getting final out buffer at the moment the created buffer is true because there is no need for assign
auto out_buffer =dev.get_sycl_buffer(output);
/// creating the shared memory for calculating reduction.
/// This one is used to collect all the reduced values of shared memory, as we don't have a global barrier on the GPU. Once they are saved we can
/// recursively apply reduction on it in order to reduce the whole.
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
Dims dims= self.xprDims();
Op functor = reducer;
// Dims dims= self.xprDims();
//Op functor = reducer;
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
typedef decltype(tuple_of_accessors) TupleType;
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel. The second is when it is treated as a leaf node to pass the
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
if(globalid<rng)
tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(functor));
else
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
if(remaining!=0 && globalid==0 )
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
});
typedef decltype(tmp_global_accessor) OutAccessor;
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
(tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
});
dev.asynchronousExec();
/// This is used to recursively reduce the tmp value to a single element.
syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
// getting final out buffer at the moment the created buffer is true because there is no need for assign
auto out_buffer =dev.get_sycl_buffer(output);
/// This is used to recursively reduce the tmp value to a single element.
syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
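A worked example of the sizing logic in FullReducer::run above (the input size is hypothetical, and remaining is presumably inputSize % red_factor):

// inputSize = 100000, red_factor = 256
//   rng       = 100000 / 256 = 390   -> each thread reduces red_factor consecutive elements
//   remaining = 100000 % 256 = 160   -> folded into thread 0's partial sum
// The kernel leaves GRange partial results in temp_global_buffer;
// syclGenericBufferReducer::run then keeps halving that buffer until a single
// value remains and writes it to the output buffer.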
@@ -190,7 +136,6 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
auto functors = TensorSycl::internal::extractFunctors(self.impl());
typedef decltype(functors) FunctorExpr;
typename Self::Index range, GRange, tileSize;
@@ -208,7 +153,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
TensorSycl::internal::ReductionFunctor<HostExpr, PlaceHolderExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
});

View File

@@ -20,12 +20,14 @@
template <class T>
struct MakeGlobalPointer {
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
};
// global pointer to set different attribute state for a class
template <class T>
struct MakeLocalPointer {
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
};
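The RefType member added here is what lets coeffRef() (see the TensorEvaluator hunks above) name a return type that is valid on both host and device. An illustration of what it resolves to (not from the commit; deref is a hypothetical helper):

// host:   MakePointer<float>::RefType       is float&
// device: MakeGlobalPointer<float>::RefType is cl::sycl::global_ptr<float>::reference_t
template <typename MakePointerT>
typename MakePointerT::RefType deref(typename MakePointerT::Type ptr, int i) {
  return ptr[i]; // a plain reference on the host, a global-memory reference on the device
}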
@@ -33,6 +35,9 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
struct NoOP;

View File

@@ -241,21 +241,25 @@ PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
#undef PADDINGOPFUNCEXT
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorContractionOp. The LHS and RHS here are the original ones; no need to apply a condition on their type.
#define SYCLEXTRFUNCCONTRACT(CVQual)\
template <typename Indices, typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev> > {\
/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp or TensorConcatenationOp.
/// For TensorContractionOp the LHS and RHS here are the original ones; no need to apply a condition on their type.
#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
const Indices func;\
FunctorExtractor(const TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.indices()) {}\
const Param func;\
FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
};
SYCLEXTRFUNCCONTRACT(const)
SYCLEXTRFUNCCONTRACT()
#undef SYCLEXTRFUNCCONTRACT
// TensorContractionOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),)
// TensorConcatenationOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
#undef SYCLEXTRFUNCCONTRACTCONCAT
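Expanding the new macro by hand for the concatenation case, SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) yields the following specialisation (mechanical expansion, shown for reference):

template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>
struct FunctorExtractor<TensorEvaluator<const TensorConcatenationOp<Param, LHSExpr, RHSExpr>, Dev> > {
  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
  const Param func; // here Param is the concatenation Axis, captured via the new axis() getter
  FunctorExtractor(const TensorEvaluator<const TensorConcatenationOp<Param, LHSExpr, RHSExpr>, Dev>& expr)
    : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()), func(expr.axis()) {}
};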
/// template deduction function for FunctorExtractor

View File

@@ -18,9 +18,53 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
OutputAccessor aOut;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
: aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
/* All threads collectively read from global memory into local.
* The barrier ensures all threads' IO is resolved before
* execution continues (strictly speaking, all threads within
* a single work-group - there is no co-ordination between
* work-groups, only work-items). */
if (globalid < length) {
scratch[localid] = aI[globalid];
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */
if (globalid < length) {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
scratch[localid] += scratch[localid + offset];
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
/* The final result will be stored in local id 0. */
if (localid == 0) {
aI[itemID.get_group(0)] = scratch[localid];
if((length<=local) && globalid ==0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
aOutPtr[0]=scratch[0];
}
}
}
}
};
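A worked trace of the halving loop above, for local = 8 and length >= 8 (illustrative values):

// offset = 4: scratch[0..3] += scratch[4..7]
// offset = 2: scratch[0..1] += scratch[2..3]
// offset = 1: scratch[0]    += scratch[1]
// local id 0 then writes scratch[0] back to aI[group], or straight to aOut
// once the remaining length fits inside a single work-group.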
/// ReductionFunctor
template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
@@ -56,6 +100,46 @@ template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, ty
};
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
struct FullReductionKernelFunctor{
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
OutAccessor tmp_global_accessor;
Index rng , remaining, red_factor;
Op op;
Dims dims;
FunctorExpr functors;
TupleType tuple_of_accessors;
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel. The second is when it is treated as a leaf node to pass the
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
if(globalid<rng)
tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
else
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
if(remaining!=0 && globalid==0 )
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
}
};
}
}
}

View File

@@ -25,6 +25,31 @@
namespace Eigen {
namespace TensorSycl {
template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
typedef typename Expr::Index Index;
Index range;
FunctorExpr functors;
TupleType tuple_of_accessors;
ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
: range(range_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
if (gId < range)
device_evaluator.evalScalar(gId);
}
};
/// The run function in tensor sycl convert the expression tree to a buffer
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
@@ -34,25 +59,19 @@ void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
auto functors = internal::extractFunctors(evaluator);
typedef decltype(functors) FunctorExpr;
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
typedef decltype(tuple_of_accessors) TupleType;
typename Expr::Index range, GRange, tileSize;
dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange);
// run the kernel
cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
if (gId < range) {
device_evaluator.evalScalar(gId);
}
});
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
ExecExprFunctorKernel<Expr, FunctorExpr, TupleType>(range, functors, tuple_of_accessors));
});
dev.asynchronousExec();
}
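For context, a hypothetical end-to-end trigger for this run() path (sycl_device and the gpu_* pointers assumed set up as in the SYCL tests below): any tensor assignment on a SyclDevice now executes through the named ExecExprFunctorKernel functor rather than an anonymous lambda.

Eigen::TensorMap<Eigen::Tensor<float, 1>> out(gpu_out_ptr, 64), in1(gpu_in1_ptr, 64), in2(gpu_in2_ptr, 64);
out.device(sycl_device) = in1 + in2; // evaluated via TensorSycl::run -> ExecExprFunctorKernel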

View File

@@ -58,6 +58,8 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
typedef T& RefType;
};
};
@@ -76,6 +78,8 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
typedef T& RefType;
};
};
@@ -98,6 +102,8 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
typedef typename MakePointerT::RefType RefType;
};
};

View File

@@ -151,6 +151,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
# older compiler that don't support cxx11.

View File

@@ -0,0 +1,180 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<typename DataType, int DataLayout, typename Index>
static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
{
Index leftDim1 = 2;
Index leftDim2 = 3;
Index leftDim3 = 1;
Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
Index rightDim1 = 2;
Index rightDim2 = 3;
Index rightDim3 = 1;
Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
//Index concatDim1 = 3;
// Index concatDim2 = 3;
// Index concatDim3 = 1;
//Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
Tensor<DataType, 3, DataLayout, Index> left(leftRange);
Tensor<DataType, 3, DataLayout, Index> right(rightRange);
left.setRandom();
right.setRandom();
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
//concatenation = left.concatenate(right, 0);
gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
}
for (int i = 2; i < 4; ++i) {
VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
}
}
sycl_device.deallocate(gpu_out_data1);
Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
//concatenation = left.concatenate(right, 1);
VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
}
for (int j = 3; j < 6; ++j) {
VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
}
}
sycl_device.deallocate(gpu_out_data2);
Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
//concatenation = left.concatenate(right, 2);
VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
}
}
sycl_device.deallocate(gpu_out_data3);
sycl_device.deallocate(gpu_in1_data);
sycl_device.deallocate(gpu_in2_data);
}
template<typename DataType, int DataLayout, typename Index>
static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
{
Index leftDim1 = 2;
Index leftDim2 = 3;
Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}};
Index rightDim1 = 2;
Index rightDim2 = 3;
Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}};
Index concatDim1 = 4;
Index concatDim2 = 3;
Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}};
Tensor<DataType, 2, DataLayout, Index> left(leftRange);
Tensor<DataType, 2, DataLayout, Index> right(rightRange);
Tensor<DataType, 2, DataLayout, Index> result(resRange);
left.setRandom();
right.setRandom();
result.setRandom();
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
// t1.concatenate(t2, 0) = result;
gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out;
sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(left(i, j), result(i, j));
VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
}
}
sycl_device.deallocate(gpu_in1_data);
sycl_device.deallocate(gpu_in2_data);
sycl_device.deallocate(gpu_out_data);
}
template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_concatenation<DataType, RowMajor, int>(sycl_device);
test_simple_concatenation<DataType, ColMajor, int>(sycl_device);
test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device);
}
void test_cxx11_tensor_concatenation_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorConcat_perDevice<float>(device));
}
}