Mirror of https://gitlab.com/libeigen/eigen.git
Fixing TensorReductionSycl for min and max.

commit 42bd5c4e7b
parent bc128f9f3b

The generic SYCL buffer reduction previously hard-coded `+=` and zero padding, which is only correct for sum. This change threads the reduction functor through `syclGenericBufferReducer` and `GenericKernelReducer` so the kernel uses `op.initialize()`/`op.reduce()`/`op.finalize()` instead, and switches the tests from `sum()` to `minimum()` and `maximum()`.
```diff
@@ -26,10 +26,10 @@ namespace Eigen {
 namespace internal {

 template<typename CoeffReturnType> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+template<typename OP, typename BufferTOut, typename BufferTIn>
+static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
   do {
-    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+    auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
       cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                               cl::sycl::range<1>{std::min(length, local)}};
       /* Two accessors are used: one to the buffer that is being reduced,
```
```diff
@@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de

       /* The parallel_for invocation chosen is the variant with an nd_item
        * parameter, since the code requires barriers for correctness. */
-      h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
+      h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
     };
     dev.sycl_queue().submit(f);
     dev.asynchronousExec();
```
```diff
@@ -123,7 +123,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
     // getting final out buffer at the moment the created buffer is true because there is no need for assign
     auto out_buffer =dev.get_sycl_buffer(output);
     /// This is used to recursively reduce the tmp value to an element of 1;
-    syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+    syclGenericBufferReducer<CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
   }

 };
```
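The shape of the fix is already visible in this first file: the buffer-level reducer no longer hard-codes addition but threads a functor down to the kernel launch. As a rough host-side sketch of the same pattern (not Eigen code; all names are hypothetical, and a power-of-two buffer size is assumed):

```cpp
#include <cstddef>
#include <vector>

// Host-side sketch of the pattern the commit introduces: the halving loop is
// written once and the operation is injected, mirroring the new
// syclGenericBufferReducer::run(op, ...) signature. Assumes scratch.size()
// is a power of two.
template <typename T, typename OP>
T tree_reduce(std::vector<T> scratch, OP op) {
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t i = 0; i < offset; ++i) {
      T accum = op.initialize();             // start from the op's identity
      op.reduce(scratch[i], &accum);         // fold in the two partners
      op.reduce(scratch[i + offset], &accum);
      scratch[i] = op.finalize(accum);       // identity for sum/min/max
    }
  }
  return scratch[0];
}
```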
```diff
@@ -35,7 +35,7 @@ namespace Eigen {
 namespace TensorSycl {
 namespace internal {

-template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
+template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;


 /// This struct is used for special expression nodes with no operations (for example assign and selectOP).
```
```diff
@@ -18,13 +18,14 @@ namespace Eigen {
 namespace TensorSycl {
 namespace internal {

-  template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+  template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+    OP op;
     OutputAccessor aOut;
     InputAccessor aI;
     LocalAccessor scratch;
     size_t length, local;
-    GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
-    : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+    GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+    : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
     void operator()(cl::sycl::nd_item<1> itemID) {
       size_t globalid = itemID.get_global(0);
       size_t localid = itemID.get_local(0);
```
```diff
@@ -44,7 +45,12 @@ namespace internal {
       auto min = (length < local) ? length : local;
       for (size_t offset = min / 2; offset > 0; offset /= 2) {
         if (localid < offset) {
-          scratch[localid] += scratch[localid + offset];
+          auto accum = op.initialize();
+          op.reduce(scratch[localid], &accum);
+          op.reduce(scratch[localid + offset], &accum);
+          op.finalize(accum);
+          scratch[localid]=accum;
+          //scratch[localid] += scratch[localid + offset];
         }
         itemID.barrier(cl::sycl::access::fence_space::local_space);
       }
```
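The kernel can stay generic because Eigen's reduction functors share an initialize/reduce/finalize protocol. A simplified sketch of a max reducer under that protocol (modeled on `Eigen::internal::MaxReducer` in TensorFunctors.h; device qualifiers and vectorized overloads omitted):

```cpp
#include <limits>

// Simplified sketch of the reducer protocol the kernel above relies on.
template <typename T>
struct MaxReducerSketch {
  // Identity element: the lowest representable value, so reducing a
  // padding slot never changes the result.
  T initialize() const { return std::numeric_limits<T>::lowest(); }
  // Fold one element into the running accumulator.
  void reduce(const T t, T* accum) const { if (t > *accum) *accum = t; }
  // Final transform; identity for max (a mean reducer would divide here).
  T finalize(const T accum) const { return accum; }
};
```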
```diff
@@ -131,11 +137,21 @@ public:
     if(globalid<rng)
       tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
     else
-      tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+      tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(op.initialize());

-    if(remaining!=0 && globalid==0 )
+    if(remaining!=0 && globalid==0 ){
       // this will add the rest of input buffer when the input size is not devidable to red_factor.
-      tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+      // tmp_global_accessor.get_pointer()[0]+=
+      auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
+          reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+      auto accum = op.initialize();
+      op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
+      op.reduce(remaining_reduce, &accum);
+      op.finalize(accum);
+      tmp_global_accessor.get_pointer()[0]=accum;
+
+
+    }
   }
 };
```
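This hunk is the heart of the bug: unused slots of the temporary buffer used to be filled with 0, which is the identity only for sum. For a minimum over all-positive data, a 0 pad wins the reduction; `op.initialize()` pads with the operation's own identity instead. A tiny standalone illustration (plain C++, hypothetical values):

```cpp
#include <algorithm>
#include <iostream>
#include <limits>

int main() {
  float data[3] = {3.f, 7.f, 5.f};  // all positive

  float pad_zero = 0.f;                              // old padding
  float pad_id = std::numeric_limits<float>::max();  // MinReducer-style initialize()

  float wrong = std::min({data[0], data[1], data[2], pad_zero});
  float right = std::min({data[0], data[1], data[2], pad_id});

  std::cout << "with 0 pad: " << wrong              // prints 0 -- the pad leaked in
            << ", with identity pad: " << right     // prints 3 -- correct
            << "\n";
  return 0;
}
```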
```diff
@@ -34,7 +34,7 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {

   in.setRandom();

-  full_redux = in.sum();
+  full_redux = in.minimum();

   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
   DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
```
```diff
@@ -43,11 +43,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
   TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);

   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
-  out_gpu.device(sycl_device) = in_gpu.sum();
+  out_gpu.device(sycl_device) = in_gpu.minimum();
   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
```
```diff
@@ -69,7 +68,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)

   in.setRandom();

-  redux= in.sum(red_axis);
+  redux= in.maximum(red_axis);

   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
```
```diff
@@ -78,7 +77,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
   TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);

   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
-  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
   sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));

   // Check that the CPU and GPU reductions return the same result.
```
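The updated tests exercise min and max through the same Tensor API on both devices. For reference, a minimal host-only version of the two reductions being checked (assuming only unsupported/Eigen/CXX11/Tensor; sizes are illustrative):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> in(20, 30, 40);
  in.setRandom();

  // Full reduction to a rank-0 tensor, as in test_full_reductions_sycl.
  Eigen::Tensor<float, 0> min_val = in.minimum();

  // Reduction over the first dimension, as in test_first_dim_reductions_sycl;
  // reducing one axis of a rank-3 tensor yields a rank-2 tensor.
  Eigen::array<Eigen::Index, 1> red_axis{{0}};
  Eigen::Tensor<float, 2> max_over_dim0 = in.maximum(red_axis);

  std::cout << "min = " << min_val()
            << ", max over dim 0 at (0,0) = " << max_over_dim0(0, 0) << "\n";
  return 0;
}
```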