Mirror of https://gitlab.com/libeigen/eigen.git
Fixing TensorReductionSycl for min and max.

commit 42bd5c4e7b
parent bc128f9f3b

The generic SYCL buffer reduction previously hard-coded `+=` and zero padding, which is only correct for sum. This change threads the reduction functor through `syclGenericBufferReducer` and `GenericKernelReducer` so the kernel uses `op.initialize()`/`op.reduce()`/`op.finalize()` instead, and switches the tests from `sum()` to `minimum()` and `maximum()`.
```diff
@@ -26,10 +26,10 @@ namespace Eigen {
 namespace internal {

 template<typename CoeffReturnType> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+template<typename OP, typename BufferTOut, typename BufferTIn>
+static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
   do {
-    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+    auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
       cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                               cl::sycl::range<1>{std::min(length, local)}};
       /* Two accessors are used: one to the buffer that is being reduced,
```
```diff
@@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de

       /* The parallel_for invocation chosen is the variant with an nd_item
        * parameter, since the code requires barriers for correctness. */
-      h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
+      h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
     };
     dev.sycl_queue().submit(f);
     dev.asynchronousExec();
```
```diff
@@ -123,7 +123,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
     // getting final out buffer at the moment the created buffer is true because there is no need for assign
     auto out_buffer =dev.get_sycl_buffer(output);
     /// This is used to recursively reduce the tmp value to an element of 1;
-    syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+    syclGenericBufferReducer<CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
   }

 };
```
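The shape of the fix is already visible in this first file: the buffer-level reducer no longer hard-codes addition but threads a functor down to the kernel launch. As a rough host-side sketch of the same pattern (not Eigen code; all names are hypothetical, and a power-of-two buffer size is assumed):

```cpp
#include <cstddef>
#include <vector>

// Host-side sketch of the pattern the commit introduces: the halving loop is
// written once and the operation is injected, mirroring the new
// syclGenericBufferReducer::run(op, ...) signature. Assumes scratch.size()
// is a power of two.
template <typename T, typename OP>
T tree_reduce(std::vector<T> scratch, OP op) {
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t i = 0; i < offset; ++i) {
      T accum = op.initialize();             // start from the op's identity
      op.reduce(scratch[i], &accum);         // fold in the two partners
      op.reduce(scratch[i + offset], &accum);
      scratch[i] = op.finalize(accum);       // identity for sum/min/max
    }
  }
  return scratch[0];
}
```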
```diff
@@ -35,7 +35,7 @@ namespace Eigen {
 namespace TensorSycl {
 namespace internal {

-template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
+template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;


 /// This struct is used for special expression nodes with no operations (for example assign and selectOP).
```
```diff
@@ -18,13 +18,14 @@ namespace Eigen {
 namespace TensorSycl {
 namespace internal {

-  template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+  template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+    OP op;
     OutputAccessor aOut;
     InputAccessor aI;
     LocalAccessor scratch;
     size_t length, local;
-    GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
-    : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+    GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+    : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
     void operator()(cl::sycl::nd_item<1> itemID) {
       size_t globalid = itemID.get_global(0);
       size_t localid = itemID.get_local(0);
```
```diff
@@ -44,7 +45,12 @@ namespace internal {
       auto min = (length < local) ? length : local;
       for (size_t offset = min / 2; offset > 0; offset /= 2) {
         if (localid < offset) {
-          scratch[localid] += scratch[localid + offset];
+          auto accum = op.initialize();
+          op.reduce(scratch[localid], &accum);
+          op.reduce(scratch[localid + offset], &accum);
+          op.finalize(accum);
+          scratch[localid]=accum;
+          //scratch[localid] += scratch[localid + offset];
         }
         itemID.barrier(cl::sycl::access::fence_space::local_space);
       }
```
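The kernel can stay generic because Eigen's reduction functors share an initialize/reduce/finalize protocol. A simplified sketch of a max reducer under that protocol (modeled on `Eigen::internal::MaxReducer` in TensorFunctors.h; device qualifiers and vectorized overloads omitted):

```cpp
#include <limits>

// Simplified sketch of the reducer protocol the kernel above relies on.
template <typename T>
struct MaxReducerSketch {
  // Identity element: the lowest representable value, so reducing a
  // padding slot never changes the result.
  T initialize() const { return std::numeric_limits<T>::lowest(); }
  // Fold one element into the running accumulator.
  void reduce(const T t, T* accum) const { if (t > *accum) *accum = t; }
  // Final transform; identity for max (a mean reducer would divide here).
  T finalize(const T accum) const { return accum; }
};
```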
```diff
@@ -131,11 +137,21 @@ public:
     if(globalid<rng)
       tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
     else
-      tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+      tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(op.initialize());

-    if(remaining!=0 && globalid==0 )
+    if(remaining!=0 && globalid==0 ){
       // this will add the rest of input buffer when the input size is not devidable to red_factor.
-      tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+      // tmp_global_accessor.get_pointer()[0]+=
+      auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
+          reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+      auto accum = op.initialize();
+      op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
+      op.reduce(remaining_reduce, &accum);
+      op.finalize(accum);
+      tmp_global_accessor.get_pointer()[0]=accum;
+
+
+    }
   }
 };
```
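This hunk is the heart of the bug: unused slots of the temporary buffer used to be filled with 0, which is the identity only for sum. For a minimum over all-positive data, a 0 pad wins the reduction; `op.initialize()` pads with the operation's own identity instead. A tiny standalone illustration (plain C++, hypothetical values):

```cpp
#include <algorithm>
#include <iostream>
#include <limits>

int main() {
  float data[3] = {3.f, 7.f, 5.f};  // all positive

  float pad_zero = 0.f;                              // old padding
  float pad_id = std::numeric_limits<float>::max();  // MinReducer-style initialize()

  float wrong = std::min({data[0], data[1], data[2], pad_zero});
  float right = std::min({data[0], data[1], data[2], pad_id});

  std::cout << "with 0 pad: " << wrong              // prints 0 -- the pad leaked in
            << ", with identity pad: " << right     // prints 3 -- correct
            << "\n";
  return 0;
}
```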
```diff
@@ -34,7 +34,7 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {

   in.setRandom();

-  full_redux = in.sum();
+  full_redux = in.minimum();

   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
   DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
```
```diff
@@ -43,11 +43,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
   TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);

   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
-  out_gpu.device(sycl_device) = in_gpu.sum();
+  out_gpu.device(sycl_device) = in_gpu.minimum();
   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
```
```diff
@@ -69,7 +68,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)

   in.setRandom();

-  redux= in.sum(red_axis);
+  redux= in.maximum(red_axis);

   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
```
```diff
@@ -78,7 +77,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
   TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);

   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
-  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
   sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));

   // Check that the CPU and GPU reductions return the same result.
```
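The updated tests exercise min and max through the same Tensor API on both devices. For reference, a minimal host-only version of the two reductions being checked (assuming only unsupported/Eigen/CXX11/Tensor; sizes are illustrative):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> in(20, 30, 40);
  in.setRandom();

  // Full reduction to a rank-0 tensor, as in test_full_reductions_sycl.
  Eigen::Tensor<float, 0> min_val = in.minimum();

  // Reduction over the first dimension, as in test_first_dim_reductions_sycl;
  // reducing one axis of a rank-3 tensor yields a rank-2 tensor.
  Eigen::array<Eigen::Index, 1> red_axis{{0}};
  Eigen::Tensor<float, 2> max_over_dim0 = in.maximum(red_axis);

  std::cout << "min = " << min_val()
            << ", max over dim 0 at (0,0) = " << max_over_dim0(0, 0) << "\n";
  return 0;
}
```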