Re-enabled the optimized reduction CUDA code.

2025-09-26 16:26:48 +08:00 · 2016-01-11 09:07:14 -08:00 · 2016-01-11 09:07:14 -08:00 · 780623261e
commit 780623261e
parent b557662e58
2 changed files with 3 additions and 5 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -527,7 +527,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
    }

    // Attempt to use an optimized reduction.
-#if 0
    else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
@@ -563,7 +562,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
        return false;
      }
    }
-#endif
    return true;
  }

--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -126,7 +126,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
  }
 };
@@ -222,7 +222,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
    const int num_per_thread = 128;
    const int num_blocks = 32;

-    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread>),
+    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
  }
 };
@@ -279,7 +279,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

-    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread>),
+    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
  }
 };