Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-04-22 01:29:35 +08:00)
Fixing a CUDA / P100 regression introduced by PR 181

PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) added the `__launch_bounds__(1024)` attribute to GPU kernels that did not have the attribute explicitly specified. That change appears to cause regressions on the CUDA platform. This PR/commit restricts the changes from PR 181 so that they apply to HIP only.
This commit is contained in:
parent
c060114a25
commit
603e213d13
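The core of the change is a launch-bounds macro that is non-empty only for HIP. A minimal sketch of the pattern, assuming a hypothetical kernel name (scale_kernel) and using __HIPCC__ as a stand-in for Eigen's own compiler-detection macros:

// Illustrative sketch only -- not Eigen code. Eigen guards on its own
// EIGEN_HIPCC / EIGEN_CUDACC macros; __HIPCC__ is used here for brevity.
#if defined(__HIPCC__)
// On HIP (ROCm 3.5+) the compiler currently assigns a default launch bound of
// 256 threads per block when the attribute is missing, so kernels meant to run
// with larger blocks must request 1024 explicitly.
#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
#else
// On CUDA the macro expands to nothing, restoring the pre-PR-181 behaviour.
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
#endif

template <typename T>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
void scale_kernel(T* data, T factor, int n) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) data[i] *= factor;
}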
@@ -440,8 +440,27 @@
 // analogous to EIGEN_CUDA_ARCH, but for HIP
 #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif
+
+// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+// value to 1024. The compiler assigns a default value of 256 when the attribute is not
+// specified. This results in failures on the HIP platform, for cases when a GPU kernel
+// without an explicit launch_bounds attribute is called with a threads_per_block value
+// greater than 256.
+//
+// This is a regression in functionality and is expected to be fixed within the next
+// couple of ROCm releases (compiler will go back to using 1024 value as the default)
+//
+// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
+// attribute.
+
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
 #endif
+
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)

 // Unify CUDA/HIPCC

 #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
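A hedged host-side sketch of why the macro matters, reusing the hypothetical scale_kernel from the sketch above (block and grid sizes are arbitrary): a launch with more than 256 threads per block is exactly the case that regressed on HIP when the attribute was missing.

void scale_on_gpu(float* d_data, int n) {
  // 1024 threads per block exceeds the HIP compiler's default bound of 256;
  // the EIGEN_HIP_LAUNCH_BOUNDS_1024 annotation on the kernel keeps this launch
  // valid on HIP, while on CUDA the attribute is simply absent.
  const int threads_per_block = 1024;
  const int blocks = (n + threads_per_block - 1) / threads_per_block;
  scale_kernel<float><<<blocks, threads_per_block>>>(d_data, 2.0f, n);
}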
@@ -29,7 +29,7 @@ void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)

 template<typename Kernel, typename Input, typename Output>
 __global__
-__launch_bounds__(1024)
+EIGEN_HIP_LAUNCH_BOUNDS_1024
 void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
 {
   int i = threadIdx.x + blockIdx.x*blockDim.x;
@@ -578,7 +578,7 @@ struct GetKernelSize<Dynamic> {

 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
@@ -630,7 +630,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(

 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
@@ -701,7 +701,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
 };

 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,
@@ -15,7 +15,7 @@
 // so we'll use a macro to make clang happy.
 #ifndef KERNEL_FRIEND
 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
 #else
 #define KERNEL_FRIEND friend
 #endif
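For context, a small sketch of how KERNEL_FRIEND is typically used (the class and kernel names below are hypothetical, and the EIGEN_HIP_LAUNCH_BOUNDS_1024 / KERNEL_FRIEND definitions are assumed from the hunks above): per the comment in the hunk, the macro exists to keep clang happy when a __global__ kernel is befriended by a class, so under clang for CUDA/HIP the friend declaration carries the same annotation as the kernel.

// Hypothetical example, not Eigen's actual classes.
template <typename T>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void reduce_kernel(const T* in, T* out, int n);

class ReductionHelper {
  // Under clang for CUDA/HIP, KERNEL_FRIEND expands to
  // "friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024"; elsewhere it is just "friend".
  template <typename T>
  KERNEL_FRIEND void reduce_kernel(const T* in, T* out, int n);

 private:
  int scratch_size_ = 0;  // private state the befriended kernel could access
};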
@@ -427,24 +427,24 @@ struct GenericReducer {

 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);


 #if defined(EIGEN_HAS_GPU_FP16)
 template <typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
 template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);

 #endif

 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif

 /**
@@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float


 template <typename CoeffType, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
@@ -132,7 +132,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val,

 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
     typename Self::CoeffReturnType* output, unsigned int* semaphore) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
@@ -214,7 +214,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, con
 #ifdef EIGEN_HAS_GPU_FP16
 template <typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
     packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
@@ -239,7 +239,7 @@ __global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Re

 template <typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -259,7 +259,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer red

 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
     half* output, packet_traits<Eigen::half>::type* scratch) {
   typedef typename packet_traits<Eigen::half>::type PacketType;
   const int packet_width = unpacket_traits<PacketType>::size;
@@ -358,7 +358,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer red
 }

 template <typename Op>
-__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(threadIdx.x == 1);
   half2* pscratch = reinterpret_cast<half2*>(scratch);
   half tmp = __float2half(0.f);
@@ -476,7 +476,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {

 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
     typename Self::CoeffReturnType* output) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
@@ -561,7 +561,7 @@ __global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, co

 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
     half* output) {
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -868,7 +868,7 @@ struct InnerReducer<Self, Op, GpuDevice> {

 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
     typename Self::CoeffReturnType* output) {
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -334,7 +334,7 @@ struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
 // parallel, but it would be better to use a parallel scan algorithm and
 // optimize memory access.
 template <typename Self, typename Reducer>
-__global__ __launch_bounds__(1024) void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
   // Compute offset as in the CPU version
   Index val = threadIdx.x + blockIdx.x * blockDim.x;
   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();