Adding SYCL backend for TensorCustomOp; fixing the partial LHS modification issue on SYCL when the RHS is a TensorContraction, reduction, or convolution; fixing the partial modification issue for memset when the SYCL backend is used.

Mehdi Goli 2017-02-28 17:16:14 +00:00
parent e0bd6f5738
commit 8296b87d7b
15 changed files with 397 additions and 67 deletions
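The recurring pattern across these diffs: SYCL accessors always expose the base of the enclosing allocation, so whenever the output pointer refers to a sub-range of a larger buffer, every kernel-side write has to add the pointer's byte distance from that base, rescaled to elements. Below is a minimal, SYCL-free sketch of that bookkeeping; get_offset_sketch and to_element_offset are illustrative stand-ins for the SyclDevice::get_offset member and the ConvertToActualSyclOffset macro introduced in this commit.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>

// Stand-in for SyclDevice::get_offset(ptr): locate the allocation that
// contains ptr and return its distance from the allocation base, in bytes.
ptrdiff_t get_offset_sketch(const std::map<const uint8_t*, size_t>& buffers,
                            const void* ptr) {
  auto it = buffers.upper_bound(static_cast<const uint8_t*>(ptr));
  assert(it != buffers.begin());
  --it;  // it->first is the base of the allocation containing ptr
  return static_cast<const uint8_t*>(ptr) - it->first;
}

// Stand-in for ConvertToActualSyclOffset(Scalar, offset): bytes -> elements.
template <typename Scalar>
ptrdiff_t to_element_offset(ptrdiff_t byte_offset) {
  return byte_offset / static_cast<ptrdiff_t>(sizeof(Scalar));
}

int main() {
  float storage[16] = {};
  std::map<const uint8_t*, size_t> buffers;
  buffers[reinterpret_cast<const uint8_t*>(storage)] = sizeof(storage);
  float* out = storage + 4;  // the logical output starts mid-allocation
  ptrdiff_t off = get_offset_sketch(buffers, out);
  // A kernel writing element i of the output must write base[i + off/size]:
  storage[0 + to_element_offset<float>(off)] = 1.f;
  assert(off == 16 && out[0] == 1.f);
}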

View File

@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
evalTo(data);
return false;
} else {
@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
size_t out_offset;
Index roundUpK, M, N, K;
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
Device dev;
KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_),
out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN];
}
}
}
@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
// extract lhs functor list
LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
// extract rhs functor list
RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
ptrdiff_t out_offset = self.device().get_offset(buffer);
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
});
self.device().asynchronousExec();
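Two things change for contraction: the kernel adds the element offset when writing out_ptr, and the output accessor moves from access::mode::write to read_write. The mode matters because a write-only accessor permits the runtime to skip transferring the buffer's current contents to the device, so a kernel that fills only part of the enclosing allocation can lose the rest. A SYCL-free model of that failure mode follows; the mode names and copy discipline are simplifying assumptions, not the SYCL runtime's actual mechanics.

#include <cassert>
#include <cstddef>
#include <vector>

enum class Mode { write_only, read_write };

// One "kernel launch" against a device-side copy of the buffer.
void launch(std::vector<int>& host, Mode mode, size_t lo, size_t hi) {
  std::vector<int> device(host.size(), -1);        // uninitialised device copy
  if (mode == Mode::read_write) device = host;     // old contents copied in
  for (size_t i = lo; i < hi; ++i) device[i] = 7;  // kernel writes a sub-range
  host = device;                                   // copied back on release
}

int main() {
  std::vector<int> a(8, 1), b(8, 1);
  launch(a, Mode::write_only, 2, 4);  // data outside [2,4) may be clobbered
  launch(b, Mode::read_write, 2, 4);  // data outside [2,4) preserved
  assert(a[0] == -1 && a[2] == 7);
  assert(b[0] == 1 && b[2] == 7);
}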

View File

@ -32,14 +32,15 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize, range_x, range_y;
Buffer_accessor buffer_acc;
ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
buffer_ptr[tensor_index] = result;
buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
}
};
@ -89,14 +90,15 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
Buffer_accessor buffer_acc;
ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
buffer_ptr[tensor_index] = result;
buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
}
};
@ -156,16 +158,17 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
Buffer_accessor buffer_acc;
ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
buffer_ptr[tensor_index] = result;
buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_kernel = in_place;
m_local_kernel = false;
} else {
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
// extract input functor list
InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
ptrdiff_t out_offset = m_device.get_offset(data);
m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
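All three convolution kernels follow the same recipe: the host computes the byte offset once, the kernel functor stores it by value, and the device-side operator() rescales it to elements at the single write site. A hypothetical, dependency-free reduction of that pattern (WriteKernelSketch is an invented name):

#include <cassert>
#include <cstddef>

// Stand-in for the convolution kernels: the functor carries the byte offset
// by value and applies it, converted to elements, at the write site.
template <typename Scalar>
struct WriteKernelSketch {
  Scalar* base;          // what the buffer accessor exposes: allocation base
  ptrdiff_t out_offset;  // byte offset of the real output inside that buffer
  void operator()(size_t tensor_index, Scalar result) const {
    base[tensor_index + static_cast<size_t>(out_offset) / sizeof(Scalar)] =
        result;
  }
};

int main() {
  float buf[10] = {};
  WriteKernelSketch<float> k{buf, static_cast<ptrdiff_t>(3 * sizeof(float))};
  k(0, 2.5f);  // lands in buf[3], not buf[0]
  assert(buf[3] == 2.5f);
}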

View File

@ -140,6 +140,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
@ -295,6 +298,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
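For reference, a compact sketch of the evaluator contract these custom-op hunks rely on: evalSubExprsIfNeeded(data) writes through evalTo when the caller supplies a destination and returns false, otherwise it evaluates into its own m_result and returns true. The fixed-size storage below is purely illustrative.

#include <cassert>

struct CustomOpEvaluatorSketch {
  float m_storage[4] = {};
  float* m_result = nullptr;

  void evalTo(float* data) {
    for (int i = 0; i < 4; ++i) data[i] = static_cast<float>(i);
  }
  bool evalSubExprsIfNeeded(float* data) {
    if (data) {
      evalTo(data);  // write straight into the caller's buffer
      return false;  // nothing left to evaluate lazily
    }
    m_result = m_storage;  // self-allocated fallback
    evalTo(m_result);
    return true;
  }
};

int main() {
  CustomOpEvaluatorSketch e;
  float out[4];
  bool needs_access = e.evalSubExprsIfNeeded(out);
  assert(!needs_access && out[3] == 3.f);
}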

View File

@ -18,6 +18,8 @@
namespace Eigen {
#define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
#define ConvertToActualSyclOffset(Scalar, offset) ((offset)/sizeof(Scalar))
template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
public:
@ -43,11 +45,12 @@ namespace Eigen {
struct memsetkernelFunctor{
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
AccType m_acc;
const ptrdiff_t buff_offset;
const size_t m_rng, m_c;
memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){}
void operator()(cl::sycl::nd_item<1> itemID) {
auto globalid=itemID.get_global_linear_id();
if (globalid< m_rng) m_acc[globalid] = m_c;
if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c;
}
};
@ -305,6 +308,11 @@ struct SyclDevice {
synchronize();
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
auto it = m_queue_stream->find_buffer(ptr);
return (static_cast<const uint8_t*>(ptr))-it->first;
}
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
/// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
/// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
@ -343,20 +351,23 @@ struct SyclDevice {
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
size_t rng, GRange, tileSize;
parallel_for_setup(n, tileSize, rng, GRange);
sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(data));
ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
synchronize();
}
struct memsetCghFunctor{
cl::sycl::buffer<uint8_t, 1>& m_buf;
const ptrdiff_t& buff_offset;
const size_t& rng , GRange, tileSize;
const int &c;
memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
:m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
:m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
void operator()(cl::sycl::handler &cgh) const {
auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
}
};
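The memset fix mirrors the same idea: since the SYCL buffer lookup returns the whole enclosing allocation, the host computes the byte offset of data within it and the kernel adds that offset to every index, so only the requested n bytes are filled. A minimal host-side model (sycl_like_memset is a made-up name):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// buffer_base is what find_buffer() yields; the kernel body is the indexed
// store inside the loop.
void sycl_like_memset(uint8_t* buffer_base, void* data, int c, size_t n) {
  ptrdiff_t buff_offset = static_cast<uint8_t*>(data) - buffer_base;
  for (size_t globalid = 0; globalid < n; ++globalid)  // the parallel_for
    buffer_base[globalid + buff_offset] = static_cast<uint8_t>(c);
}

int main() {
  uint8_t buf[16];
  std::memset(buf, 0xAA, sizeof(buf));
  sycl_like_memset(buf, buf + 4, 0, 8);  // clear bytes [4, 12) only
  assert(buf[3] == 0xAA && buf[4] == 0 && buf[11] == 0 && buf[12] == 0xAA);
}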

View File

@ -27,9 +27,9 @@ namespace internal {
template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
template<typename BufferTOut, typename BufferTIn>
static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
bufOut, bufI, dev, length, local);
bufOut, out_offset, bufI, dev, length, local);
}
};
@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// get the final output buffer; at this point the buffer has already been created, so there is no need for an assign
auto out_buffer =dev.get_sycl_buffer(output);
ptrdiff_t out_offset = dev.get_offset(output);
/// This is used to recursively reduce the tmp value to an element of 1;
syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
}
};
@ -158,10 +159,11 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
// create a tuple of accessors from Evaluator
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
ptrdiff_t out_offset = dev.get_offset(output);
Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
(output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
});
dev.asynchronousExec();

View File

@ -109,6 +109,7 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
};
// TensorForcedEvalOp
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)

View File

@ -236,8 +236,12 @@ EVALTO()
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
typedef TensorForcedEvalOp<OrigExpr> XprType;\
typedef CVQual TensorMap<\
Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
Eigen::internal::traits<XprType>::Layout, \
MakeGlobalPointer\
> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@ -248,6 +252,28 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
#define TENSORCUSTOMUNARYOP(CVQual)\
template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
typedef CVQual TensorMap<\
Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
Eigen::internal::traits<XprType>::Layout, \
MakeGlobalPointer\
> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
TENSORCUSTOMUNARYOP(const)
TENSORCUSTOMUNARYOP()
#undef TENSORCUSTOMUNARYOP
template <bool Conds, size_t X , size_t Y > struct ValueCondition {
static const size_t Res =X;
};
@ -260,7 +286,7 @@ template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR()
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorContractionOp
#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
/// TensorContractionOp, TensorConvolutionOp, and TensorCustomBinaryOp
#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
typedef CVQual TensorMap<\
Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
Eigen::internal::traits<XprTyp>::Layout, \
MakeGlobalPointer\
> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTION
SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
#undef SYCLCONTRACTCONVCUSBIOPS
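What these ExprConstructor specialisations amount to: on the device, a custom op (like a forced eval, contraction, or convolution) is never re-executed; its placeholder is rebuilt as a TensorMap leaf over the raw accessor, reusing the buffer the host already filled. A rough non-Eigen illustration, where DeviceMapSketch and rebind_custom_op are invented stand-ins:

#include <cassert>
#include <cstddef>

// Stand-in for TensorMap<..., MakeGlobalPointer>: a flat view over memory
// the host has already filled.
template <typename Scalar>
struct DeviceMapSketch {
  Scalar* data;
  size_t size;
  Scalar operator[](size_t i) const { return data[i]; }
};

// What the specialisation effectively does on the device: reinterpret the
// raw byte accessor as Scalar* (ConvertToActualTypeSycl) and wrap it with
// the dimensions recorded by the functor extractor.
template <typename Scalar>
DeviceMapSketch<Scalar> rebind_custom_op(void* accessor_base, size_t size) {
  return DeviceMapSketch<Scalar>{static_cast<Scalar*>(accessor_base), size};
}

int main() {
  float result[3] = {1.f, 2.f, 3.f};  // produced earlier by the host evalTo
  DeviceMapSketch<float> leaf = rebind_custom_op<float>(result, 3);
  assert(leaf[1] == 2.f);
}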
#define SYCLSLICEOPEXPR(CVQual)\

View File

@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC()
#undef SYCLFORCEDEVALEXTACC
#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
template <typename CustomUnaryFunc, typename XprType, typename Dev >\
struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLCUSTOMUNARYOPEXTACC(const)
SYCLCUSTOMUNARYOPEXTACC()
#undef SYCLCUSTOMUNARYOPEXTACC
#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLCUSTOMBINARYOPEXTACC(const)
SYCLCUSTOMBINARYOPEXTACC()
#undef SYCLCUSTOMBINARYOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
#define SYCLEVALTOEXTACC(CVQual)\
template <typename Expr, typename Dev>\

View File

@ -33,14 +33,17 @@ namespace internal {
/// re-instantiate them on the device.
/// We have to pass instantiated functors to the device.
// This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
#define DEFAULTACTION(Evaluator)\
typedef typename Evaluator::Dimensions Dimensions;\
const Dimensions m_dimensions;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
template <typename Evaluator> struct FunctorExtractor{
typedef typename Evaluator::Dimensions Dimensions;
const Dimensions m_dimensions;
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
FunctorExtractor(const Evaluator& expr)
: m_dimensions(expr.dimensions()) {}
DEFAULTACTION(Evaluator)
};
/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
///TensorConversionOp
#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const)
SYCLEXTRFUNCTERNARY()
#undef SYCLEXTRFUNCTERNARY
// TensorCustomOp must be specialised; otherwise it would be captured by UnaryCategory, although its behaviour differs
// from UnaryCategory's and instead matches the general FunctorExtractor.
/// specialisation of TensorCustomOp
#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
DEFAULTACTION(Evaluator)\
};
SYCLEXTRFUNCCUSTOMUNARYOP(const)
SYCLEXTRFUNCCUSTOMUNARYOP()
#undef SYCLEXTRFUNCCUSTOMUNARYOP
#define SYCLEXTRFUNCCUSTOMBINARYOP(CVQual)\
template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
DEFAULTACTION(Evaluator)\
};
SYCLEXTRFUNCCUSTOMBINARYOP(const)
SYCLEXTRFUNCCUSTOMBINARYOP()
#undef SYCLEXTRFUNCCUSTOMBINARYOP
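The DEFAULTACTION macro captures the leaf-like extraction policy in one place: the host keeps only the evaluator's dimensions, because the device rebuilds the node from an accessor and needs no functor state. A toy model of that extractor under assumed names:

#include <array>
#include <cassert>

// The DEFAULTACTION body, spelled out: capture dimensions, expose them,
// and let nothing else cross the host/device boundary for this node.
template <typename Evaluator>
struct LeafLikeExtractorSketch {
  typedef typename Evaluator::Dimensions Dimensions;
  const Dimensions m_dimensions;
  explicit LeafLikeExtractorSketch(const Evaluator& e)
      : m_dimensions(e.dimensions()) {}
  const Dimensions& dimensions() const { return m_dimensions; }
};

// Invented evaluator with just enough surface for the sketch.
struct FakeCustomOpEvaluator {
  typedef std::array<int, 2> Dimensions;
  Dimensions dims;
  const Dimensions& dimensions() const { return dims; }
};

int main() {
  FakeCustomOpEvaluator ev;
  ev.dims = {{6, 10}};
  LeafLikeExtractorSketch<FakeCustomOpEvaluator> fx(ev);
  assert(fx.dimensions()[0] == 6 && fx.dimensions()[1] == 10);
}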
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCSELECTOP(CVQual)\

View File

@ -21,11 +21,12 @@ namespace internal {
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
OP op;
OutputAccessor aOut;
ptrdiff_t out_offset;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
: op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
: op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
@ -59,7 +60,7 @@ namespace internal {
aI[itemID.get_group(0)] = scratch[localid];
if((length<=local) && globalid ==0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
aOutPtr[0]=scratch[0];
aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
}
}
}
@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
output_accessor_ptr[globalid]= accum;
output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
}
}
private:
write_accessor output_accessor;
ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
@ -111,9 +113,9 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
output_accessor_ptr[globalid]= accum/num_values_to_reduce;
output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
}
}
private:
write_accessor output_accessor;
ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
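The generic buffer reducer repeatedly launches a kernel that collapses local-sized groups into a shrinking temporary, and only the final single value is written to the real output, which is where out_offset now applies. A sequential sketch of that loop, assuming a sum reducer (reduce_to_offset is an illustrative name):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Each do/while pass models one kernel launch; tmp plays the temporary
// global buffer, out_base the accessor over the real output.
float reduce_to_offset(std::vector<float> tmp, float* out_base,
                       ptrdiff_t out_offset_bytes, size_t local) {
  size_t length = tmp.size();
  do {
    size_t groups = (length + local - 1) / local;
    for (size_t g = 0; g < groups; ++g) {  // one work-group per group
      float acc = 0.f;                     // local-memory partial sum
      size_t end = std::min(length, (g + 1) * local);
      for (size_t i = g * local; i < end; ++i) acc += tmp[i];
      tmp[g] = acc;
    }
    length = groups;
  } while (length > 1);
  // Only the final value lands in the real output, at the element offset.
  out_base[static_cast<size_t>(out_offset_bytes) / sizeof(float)] = tmp[0];
  return tmp[0];
}

int main() {
  float out[4] = {};
  std::vector<float> v(100, 1.f);
  reduce_to_offset(v, out, static_cast<ptrdiff_t>(2 * sizeof(float)), 16);
  assert(out[2] == 100.f && out[0] == 0.f);
}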

View File

@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const)
SYCLFORCEDEVALLEAFCOUNT()
#undef SYCLFORCEDEVALLEAFCOUNT
#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
template <typename CustomUnaryFunc, typename XprType>\
struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
static const size_t Count =1;\
};
SYCLCUSTOMUNARYOPLEAFCOUNT(const)
SYCLCUSTOMUNARYOPLEAFCOUNT()
#undef SYCLCUSTOMUNARYOPLEAFCOUNT
#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
static const size_t Count =1;\
};
SYCLCUSTOMBINARYOPLEAFCOUNT(const)
SYCLCUSTOMBINARYOPLEAFCOUNT()
#undef SYCLCUSTOMBINARYOPLEAFCOUNT
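LeafCount drives how many accessor slots the kernel argument tuple gets; a custom op pins its whole subtree to a single precomputed buffer, hence Count is 1 regardless of what it wraps. A toy compile-time illustration under invented node names:

#include <cstddef>

struct Leaf {};                          // e.g. a TensorMap
template <class L, class R> struct Add {};
template <class X> struct CustomOp {};   // evaluated into its own buffer

template <class T> struct LeafCountSketch { static const size_t Count = 1; };
template <class L, class R> struct LeafCountSketch<Add<L, R> > {
  static const size_t Count =
      LeafCountSketch<L>::Count + LeafCountSketch<R>::Count;
};
// A custom op is one leaf no matter how large the subtree it wraps:
template <class X> struct LeafCountSketch<CustomOp<X> > {
  static const size_t Count = 1;
};

static_assert(
    LeafCountSketch<Add<Leaf, CustomOp<Add<Leaf, Leaf> > > >::Count == 2,
    "the custom op collapses its subtree to a single accessor slot");

int main() {}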
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\
template <typename Expr>\

View File

@ -143,6 +143,33 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCustomUnaryOp
#define CUSTOMUNARYOPEVAL(CVQual)\
template <typename CustomUnaryFunc, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
};
CUSTOMUNARYOPEVAL(const)
CUSTOMUNARYOPEVAL()
#undef CUSTOMUNARYOPEVAL
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCustomBinaryOp
#define CUSTOMBINARYOPEVAL(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
};
CUSTOMBINARYOPEVAL(const)
CUSTOMBINARYOPEVAL()
#undef CUSTOMBINARYOPEVAL
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorEvalToOp, TensorLayoutSwapOp
#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\

View File

@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
# older compiler that don't support cxx11.

View File

@ -0,0 +1,165 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<typename TensorType>
struct InsertZeros {
DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
DSizes<DenseIndex, 2> result;
result[0] = input.dimension(0) * 2;
result[1] = input.dimension(1) * 2;
return result;
}
template <typename Output, typename Device>
void eval(const TensorType& input, Output& output, const Device& device) const
{
array<DenseIndex, 2> strides;
strides[0] = 2;
strides[1] = 2;
output.stride(strides).device(device) = input;
Eigen::DSizes<DenseIndex, 2> offsets(1,1);
Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
}
};
template<typename DataType, int DataLayout, typename IndexType>
static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
{
IndexType sizeDim1 = 3;
IndexType sizeDim2 = 5;
Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};
Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
TensorType gpu_in1(gpu_in1_data, tensorRange);
TensorType gpu_out(gpu_out_data, tensorResultRange);
in1.setRandom();
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
VERIFY_IS_EQUAL(out.dimension(0), 6);
VERIFY_IS_EQUAL(out.dimension(1), 10);
for (int i = 0; i < 6; i+=2) {
for (int j = 0; j < 10; j+=2) {
VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
}
}
for (int i = 1; i < 6; i+=2) {
for (int j = 1; j < 10; j+=2) {
VERIFY_IS_EQUAL(out(i, j), 0);
}
}
}
template<typename TensorType>
struct BatchMatMul {
DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
DSizes<DenseIndex, 3> result;
result[0] = input1.dimension(0);
result[1] = input2.dimension(1);
result[2] = input2.dimension(2);
return result;
}
template <typename Output, typename Device>
void eval(const TensorType& input1, const TensorType& input2,
Output& output, const Device& device) const
{
typedef typename TensorType::DimensionPair DimPair;
array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
for (int64_t i = 0; i < output.dimension(2); ++i) {
output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
}
}
};
template<typename DataType, int DataLayout, typename IndexType>
static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
{
Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}};
Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
TensorType gpu_in1(gpu_in1_data, tensorRange1);
TensorType gpu_in2(gpu_in2_data, tensorRange2);
TensorType gpu_out(gpu_out_data, tensorResultRange);
in1.setRandom();
in2.setRandom();
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
for (IndexType i = 0; i < 5; ++i) {
typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
for (IndexType j = 0; j < 2; ++j) {
for (IndexType k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(val(j, k), reference(j, k));
}
}
}
}
template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_custom_op_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(custom_op_perDevice<float>(device));
}
}

View File

@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));