Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-12 03:39:01 +08:00)
Adding SYCL backend for TensorCustomOp; fixing the partial LHS modification issue on SYCL when the RHS is a TensorContraction, reduction, or convolution; fixing the partial modification for memset when the SYCL backend is used.
This commit is contained in:
parent
e0bd6f5738
commit
8296b87d7b
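The common thread of the fixes: the SYCL device keys its buffers by allocation base pointer, so an output pointer that refers to the middle of an allocation (a partially assigned lhs, or a memset on a sub-range) was previously resolved to the start of the buffer and kernels overwrote data from offset 0. The commit threads the byte offset of the output pointer (SyclDevice::get_offset) into every kernel and converts it to an element offset with ConvertToActualSyclOffset(Scalar, offset). A minimal sketch of that arithmetic is shown below; it is not part of the diff and the names are hypothetical.

// Illustrative sketch only (not part of the diff); names are hypothetical.
// It mirrors what SyclDevice::get_offset and ConvertToActualSyclOffset do:
// the byte distance between a sub-tensor pointer and the start of its backing
// buffer becomes an element offset that every kernel adds to its index.
#include <cassert>
#include <cstddef>
#include <cstdint>

std::ptrdiff_t byte_offset(const void* ptr, const void* buffer_start) {
  // get_offset(): pointer into the buffer minus the buffer's base address
  return static_cast<const std::uint8_t*>(ptr) - static_cast<const std::uint8_t*>(buffer_start);
}

template <typename Scalar>
std::ptrdiff_t element_offset(std::ptrdiff_t bytes) {
  // ConvertToActualSyclOffset(Scalar, offset): offset / sizeof(Scalar)
  return bytes / static_cast<std::ptrdiff_t>(sizeof(Scalar));
}

int main() {
  float storage[16] = {};            // stands in for the SYCL buffer
  float* sub_tensor = storage + 5;   // lhs that starts mid-buffer
  std::ptrdiff_t off = element_offset<float>(byte_offset(sub_tensor, storage));
  assert(off == 5);                  // kernels write out[gid + off] instead of out[gid]
  return 0;
}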
@@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
 this->m_leftImpl.evalSubExprsIfNeeded(NULL);
 this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-if (data) {
+if (data) {
 evalTo(data);
 return false;
 } else {
@@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
 LhsLocalAcc localLhs;
 RhsLocalAcc localRhs;
 OutAccessor out_res;
+size_t out_offset;
 Index roundUpK, M, N, K;
 ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
 LeftNocontractT m_i_strides, m_left_nocontract_strides;
@@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
 Device dev;


-KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
+KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_,
 Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
 ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
 LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
-:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
+:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_),
+out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
 m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
 m_right_contracting_strides(m_right_contracting_strides_),
 m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
@@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
 for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
 Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
 if(globalCol<N)
-out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
+out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN];
 }
 }
 }
@@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
 // extract lhs functor list
 LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
 // extract rhs functor list
-RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
+RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());

 Index roundUpK = RoundUp(K, TileSizeDimK);
 Index roundUpM = RoundUp(M, TileSizeDimM);
 Index roundUpN = RoundUp(N, TileSizeDimN);

+ptrdiff_t out_offset = self.device().get_offset(buffer);
 self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
 /// work-around for gcc bug
 typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
@@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
 typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
 RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);

-typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
+typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
 //OutScalar memory
-OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
+OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
 // sycl parallel for
 cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
 cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
 KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
 RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
 WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
-localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
+localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
 m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
 });
 self.device().asynchronousExec();
@@ -32,14 +32,15 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La
 Kernel_accessor kernel_filter;
 const size_t kernelSize, range_x, range_y;
 Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
 Local_accessor local_acc;
 FunctorExpr functors;
 TupleType tuple_of_accessors;
 EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
 Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
-Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
 :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
-buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

 void operator()(cl::sycl::nd_item<2> itemID) {
 typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
 }
 const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
 +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
-buffer_ptr[tensor_index] = result;
+buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
 }
 }
 };
@@ -89,14 +90,15 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La
 Kernel_accessor kernel_filter;
 const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
 Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
 Local_accessor local_acc;
 FunctorExpr functors;
 TupleType tuple_of_accessors;
 EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
 Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
-Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
 :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
-buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

 void operator()(cl::sycl::nd_item<3> itemID) {
 typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
 }
 const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
 +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
-buffer_ptr[tensor_index] = result;
+buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
 }
 }
 };
@@ -156,16 +158,17 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La
 Kernel_accessor kernel_filter;
 const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
 Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
 Local_accessor local_acc;
 FunctorExpr functors;
 TupleType tuple_of_accessors;
 EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
 Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
 const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
-Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
 :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
 kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
-buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

 void operator()(cl::sycl::nd_item<3> itemID) {
 typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
 }
 const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
 +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
-buffer_ptr[tensor_index] = result;
+buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
 }

 itemID.barrier(cl::sycl::access::fence_space::local_space);
@@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 m_kernel = in_place;
 m_local_kernel = false;
 } else {
-size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
 typedef TensorEvalToOp<const KernelArgType> EvalTo;
 EvalTo evalToTmp(local, m_kernelArg);
@@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
 // extract input functor list
 InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
+ptrdiff_t out_offset = m_device.get_offset(data);


 m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
@@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
 EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
 InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
 break;
 }

@@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
 EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
 InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
 break;
 }

@@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
 InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
 indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
-numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
 break;
 }

@@ -140,6 +140,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi

 EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

+/// used by sycl in order to build the sycl buffer
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+
 protected:
 EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
 TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
@@ -295,6 +298,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,

 EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

+/// used by sycl in order to build the sycl buffer
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+
 protected:
 EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
 TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
@@ -18,6 +18,8 @@
 namespace Eigen {

 #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
+#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar)
+

 template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
 public:
@@ -43,11 +45,12 @@ namespace Eigen {
 struct memsetkernelFunctor{
 typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
 AccType m_acc;
+const ptrdiff_t buff_offset;
 const size_t m_rng, m_c;
-memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
+memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){}
 void operator()(cl::sycl::nd_item<1> itemID) {
 auto globalid=itemID.get_global_linear_id();
-if (globalid< m_rng) m_acc[globalid] = m_c;
+if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c;
 }

 };
@@ -305,6 +308,11 @@ struct SyclDevice {
 synchronize();
 }

+EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+auto it = m_queue_stream->find_buffer(ptr);
+return (static_cast<const uint8_t*>(ptr))-it->first;
+
+}
 /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
 /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
 /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
@@ -343,20 +351,23 @@ struct SyclDevice {
 EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
 size_t rng, GRange, tileSize;
 parallel_for_setup(n, tileSize, rng, GRange);
-sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
+auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(data));
+ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
+sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
 synchronize();
 }

 struct memsetCghFunctor{
 cl::sycl::buffer<uint8_t, 1>& m_buf;
+const ptrdiff_t& buff_offset;
 const size_t& rng , GRange, tileSize;
 const int &c;
-memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
-:m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
+memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
+:m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}

 void operator()(cl::sycl::handler &cgh) const {
 auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
+cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
 }
 };

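The memset hunks above show the motivating case: the destination pointer can lie inside a larger device allocation, and before this change the kernel wrote m_acc[globalid], clearing the start of the buffer rather than the requested sub-range. A hedged usage sketch follows; it assumes the SyclDevice API shown in the diff (allocate/memset/deallocate), and the sizes and the "second half" pointer are made up for illustration.

// Hedged usage sketch only; not part of the diff.
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void zero_second_half(const Eigen::SyclDevice& dev) {
  const size_t n = 64;
  float* buf = static_cast<float*>(dev.allocate(n * sizeof(float)));
  // Pointer into the middle of the same allocation: the buffer lookup now
  // returns (buffer, byte offset) and the kernel adds buff_offset, so only
  // the second half is cleared instead of the first n/2 elements.
  float* second_half = buf + n / 2;
  dev.memset(second_half, 0, (n / 2) * sizeof(float));
  dev.deallocate(buf);
}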
@@ -27,9 +27,9 @@ namespace internal {

 template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
 template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
 do {
-auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
 cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
 cl::sycl::range<1>{std::min(length, local)}};
 /* Two accessors are used: one to the buffer that is being reduced,
@@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev

 /* The parallel_for invocation chosen is the variant with an nd_item
 * parameter, since the code requires barriers for correctness. */
-h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
 };
 dev.sycl_queue().submit(f);
 dev.asynchronousExec();
@@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev

 template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
 template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
 syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
-bufOut, bufI, dev, length, local);
+bufOut, out_offset, bufI, dev, length, local);
 }
 };

@@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

 // getting final out buffer at the moment the created buffer is true because there is no need for assign
 auto out_buffer =dev.get_sycl_buffer(output);
+ptrdiff_t out_offset = dev.get_offset(output);
 /// This is used to recursively reduce the tmp value to an element of 1;
-syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
 }

 };
@@ -158,10 +159,11 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
 // create a tuple of accessors from Evaluator
 Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
 auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
+ptrdiff_t out_offset = dev.get_offset(output);
 Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
 cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
 TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
-(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+(output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));

 });
 dev.asynchronousExec();
@@ -109,6 +109,7 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
 typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
 };


 // TensorForcedEvalOp
 KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
 KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)
@@ -236,8 +236,12 @@ EVALTO()
 template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
 CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
-typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
-TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+typedef TensorForcedEvalOp<OrigExpr> XprType;\
+typedef CVQual TensorMap<\
+Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+Eigen::internal::traits<XprType>::Layout, \
+MakeGlobalPointer\
+> Type;\
 Type expr;\
 template <typename FuncDetector>\
 ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@@ -248,6 +252,28 @@ FORCEDEVAL(const)
 FORCEDEVAL()
 #undef FORCEDEVAL



+#define TENSORCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
+typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
+typedef CVQual TensorMap<\
+Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+Eigen::internal::traits<XprType>::Layout, \
+MakeGlobalPointer\
+> Type;\
+Type expr;\
+template <typename FuncDetector>\
+ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+};
+
+TENSORCUSTOMUNARYOP(const)
+TENSORCUSTOMUNARYOP()
+#undef TENSORCUSTOMUNARYOP

 template <bool Conds, size_t X , size_t Y > struct ValueCondition {
 static const size_t Res =X;
 };
@@ -260,7 +286,7 @@ template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
 template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
 CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
-static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
 typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
 NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
 Type expr;\
@@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR()


 /// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
+/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
+#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
 template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
 struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
 CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
-static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
-typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
-NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
-typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
-Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
+typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
+static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
+typedef CVQual TensorMap<\
+Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
+Eigen::internal::traits<XprTyp>::Layout, \
+MakeGlobalPointer\
+> Type;\
 Type expr;\
 template <typename FuncDetector>\
 ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
 :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
 };

-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION

+SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
+#undef SYCLCONTRACTCONVCUSBIOPS


 #define SYCLSLICEOPEXPR(CVQual)\
@@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC()
 #undef SYCLFORCEDEVALEXTACC


+#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, typename Dev >\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
+static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
+RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+
+SYCLCUSTOMUNARYOPEXTACC(const)
+SYCLCUSTOMUNARYOPEXTACC()
+#undef SYCLCUSTOMUNARYOPEXTACC
+
+
+#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
+static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
+RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+SYCLCUSTOMBINARYOPEXTACC(const)
+SYCLCUSTOMBINARYOPEXTACC()
+#undef SYCLCUSTOMBIBARYOPEXTACC
+
+

 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
 #define SYCLEVALTOEXTACC(CVQual)\
 template <typename Expr, typename Dev>\
@@ -33,14 +33,17 @@ namespace internal {
 /// re-instantiate them on the device.
 /// We have to pass instantiated functors to the device.
 // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
+#define DEFALTACTION(Evaluator)\
+typedef typename Evaluator::Dimensions Dimensions;\
+const Dimensions m_dimensions;\
+EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
+
 template <typename Evaluator> struct FunctorExtractor{
-typedef typename Evaluator::Dimensions Dimensions;
-const Dimensions m_dimensions;
-EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-FunctorExtractor(const Evaluator& expr)
-: m_dimensions(expr.dimensions()) {}
+DEFALTACTION(Evaluator)
 };


 /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
 ///TensorConversionOp
 #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
@@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const)
 SYCLEXTRFUNCTERNARY()
 #undef SYCLEXTRFUNCTERNARY



+//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different
+//from the UnaryCategory and it is similar to the general FunctorExtractor.
+/// specialisation of TensorCustomOp
+#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
+typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
+DEFALTACTION(Evaluator)\
+};
+
+SYCLEXTRFUNCCUSTOMUNARYOP(const)
+SYCLEXTRFUNCCUSTOMUNARYOP()
+#undef SYCLEXTRFUNCCUSTOMUNARYOP
+
+
+#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
+template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
+typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
+DEFALTACTION(Evaluator)\
+};
+
+SYCLEXTRFUNCCUSTOMBIBARYOP(const)
+SYCLEXTRFUNCCUSTOMBIBARYOP()
+#undef SYCLEXTRFUNCCUSTOMBIBARYOP
+
+

 /// specialisation of the \ref FunctorExtractor struct when the node type is
 /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
 #define SYCLEXTRFUNCSELECTOP(CVQual)\
@@ -21,11 +21,12 @@ namespace internal {
 template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
 OP op;
 OutputAccessor aOut;
+ptrdiff_t out_offset;
 InputAccessor aI;
 LocalAccessor scratch;
 size_t length, local;
-GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
-: op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+: op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
 void operator()(cl::sycl::nd_item<1> itemID) {
 size_t globalid = itemID.get_global(0);
 size_t localid = itemID.get_local(0);
@@ -59,7 +60,7 @@ namespace internal {
 aI[itemID.get_group(0)] = scratch[localid];
 if((length<=local) && globalid ==0){
 auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
-aOutPtr[0]=scratch[0];
+aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
 }
 }
 }
@@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
 public:
 typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
 typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
-ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
-:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
+ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
+:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
 void operator()(cl::sycl::nd_item<1> itemID) {

 typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
 typename DeviceSelf::CoeffReturnType accum = functor.initialize();
 Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
 functor.finalize(accum);
-output_accessor_ptr[globalid]= accum;
+output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
 }
 }
 private:
 write_accessor output_accessor;
+ptrdiff_t out_offset;
 FunctorExpr functors;
 Tuple_of_Acc tuple_of_accessors;
 Dims dims;
@@ -111,9 +113,9 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
 typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
 typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
 typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
-ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
+ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
 Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
-:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
+:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
 void operator()(cl::sycl::nd_item<1> itemID) {

 typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
 typename DeviceSelf::CoeffReturnType accum = functor.initialize();
 Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
 functor.finalize(accum);
-output_accessor_ptr[globalid]= accum/num_values_to_reduce;
+output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
 }
 }
 private:
 write_accessor output_accessor;
+ptrdiff_t out_offset;
 FunctorExpr functors;
 Tuple_of_Acc tuple_of_accessors;
 Dims dims;
@@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const)
 SYCLFORCEDEVALLEAFCOUNT()
 #undef SYCLFORCEDEVALLEAFCOUNT

+#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
+template <typename CustomUnaryFunc, typename XprType>\
+struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
+static const size_t Count =1;\
+};
+
+SYCLCUSTOMUNARYOPLEAFCOUNT(const)
+SYCLCUSTOMUNARYOPLEAFCOUNT()
+#undef SYCLCUSTOMUNARYOPLEAFCOUNT
+
+
+#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
+struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
+static const size_t Count =1;\
+};
+SYCLCUSTOMBINARYOPLEAFCOUNT( const)
+SYCLCUSTOMBINARYOPLEAFCOUNT()
+#undef SYCLCUSTOMBINARYOPLEAFCOUNT
+
 /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
 #define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\
 template <typename Expr>\
@@ -143,6 +143,33 @@ FORCEDEVAL(const)
 FORCEDEVAL()
 #undef FORCEDEVAL


+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define CUSTOMUNARYOPEVAL(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
+typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
+};
+
+CUSTOMUNARYOPEVAL(const)
+CUSTOMUNARYOPEVAL()
+#undef CUSTOMUNARYOPEVAL
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define CUSTOMBINARYOPEVAL(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
+typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
+};
+
+CUSTOMBINARYOPEVAL(const)
+CUSTOMBINARYOPEVAL()
+#undef CUSTOMBINARYOPEVAL
+
+
 /// specialisation of the \ref PlaceHolderExpression when the node is
 /// TensorEvalToOp, TensorLayoutSwapOp
 #define EVALTOLAYOUTSWAP(CVQual, ExprNode)\
@@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11)
 ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11")
 ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
 ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11")
+ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11")
 endif(EIGEN_TEST_SYCL)
 # It should be safe to always run these tests as there is some fallback code for
 # older compiler that don't support cxx11.
unsupported/test/cxx11_tensor_custom_op_sycl.cpp (new file, 165 lines)
@@ -0,0 +1,165 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
template<typename TensorType>
struct InsertZeros {
  DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
    DSizes<DenseIndex, 2> result;
    result[0] = input.dimension(0) * 2;
    result[1] = input.dimension(1) * 2;
    return result;
  }

  template <typename Output, typename Device>
  void eval(const TensorType& input, Output& output, const Device& device) const
  {
    array<DenseIndex, 2> strides;
    strides[0] = 2;
    strides[1] = 2;
    output.stride(strides).device(device) = input;

    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
  }
};

template<typename DataType, int DataLayout, typename IndexType>
static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
{
  IndexType sizeDim1 = 3;
  IndexType sizeDim2 = 5;
  Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
  Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};

  Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
  Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);

  DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));

  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
  TensorType gpu_in1(gpu_in1_data, tensorRange);
  TensorType gpu_out(gpu_out_data, tensorResultRange);

  in1.setRandom();
  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
  gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));

  VERIFY_IS_EQUAL(out.dimension(0), 6);
  VERIFY_IS_EQUAL(out.dimension(1), 10);

  for (int i = 0; i < 6; i+=2) {
    for (int j = 0; j < 10; j+=2) {
      VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
    }
  }
  for (int i = 1; i < 6; i+=2) {
    for (int j = 1; j < 10; j+=2) {
      VERIFY_IS_EQUAL(out(i, j), 0);
    }
  }
}

template<typename TensorType>
struct BatchMatMul {
  DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
    DSizes<DenseIndex, 3> result;
    result[0] = input1.dimension(0);
    result[1] = input2.dimension(1);
    result[2] = input2.dimension(2);
    return result;
  }

  template <typename Output, typename Device>
  void eval(const TensorType& input1, const TensorType& input2,
            Output& output, const Device& device) const
  {
    typedef typename TensorType::DimensionPair DimPair;
    array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);
    for (int64_t i = 0; i < output.dimension(2); ++i) {
      output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
    }
  }
};

template<typename DataType, int DataLayout, typename IndexType>
static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
{
  Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
  Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
  Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}};

  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);

  DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));

  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
  TensorType gpu_in1(gpu_in1_data, tensorRange1);
  TensorType gpu_in2(gpu_in2_data, tensorRange2);
  TensorType gpu_out(gpu_out_data, tensorResultRange);

  in1.setRandom();
  in2.setRandom();

  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));

  gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));

  for (IndexType i = 0; i < 5; ++i) {
    typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
    array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);
    Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
    TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 7; ++k) {
        VERIFY_IS_APPROX(val(j, k), reference(j, k));
      }
    }
  }
}

template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);

}
void test_cxx11_tensor_custom_op_sycl() {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(custom_op_perDevice<float>(device));
  }
}
@@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
 Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
 Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
 sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
-sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
 /// c=(a+b)*b
 gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));