Fixing a CUDA / P100 regression introduced by PR 181

PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds a `__launch_bounds__(1024)` attribute to GPU kernels that previously did not have that attribute explicitly specified.

That PR appears to cause regressions on the CUDA platform (observed on a P100). This commit restricts the changes from PR 181 so that they apply only when compiling for HIP; a short sketch of the resulting pattern follows the commit metadata below.

Author: Deven Desai
Date:   2020-08-19 20:06:39 +00:00
Parent: c060114a25
Commit: 603e213d13
6 changed files with 40 additions and 21 deletions
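
To illustrate the approach, here is a minimal sketch of the pattern the diffs below switch to; the kernel name and launch configuration are hypothetical, and only the EIGEN_HIP_LAUNCH_BOUNDS_1024 macro itself comes from this commit. The launch-bounds annotation is supplied through a macro that expands to `__launch_bounds__(1024)` only when compiling for HIP and to nothing otherwise, so CUDA kernels keep their pre-PR-181 behaviour.

    // Sketch only: the real definition (first diff hunk below) is active only for HIP builds;
    // __HIPCC__ is used here just to keep the example self-contained.
    #if defined(__HIPCC__)
      #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
    #else
      #define EIGEN_HIP_LAUNCH_BOUNDS_1024   // expands to nothing for CUDA builds
    #endif

    // Hypothetical kernel, annotated the same way the Eigen kernels are in this commit.
    template <typename T>
    __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void scale_kernel(T* data, T factor, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= factor;
    }

    // On ROCm 3.5+ the explicit bound keeps a 1024-thread launch valid; on CUDA the kernel
    // compiles exactly as it did before PR 181 (no launch_bounds attribute), e.g.
    //   scale_kernel<float><<<num_blocks, 1024>>>(d_data, 2.0f, n);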

--------------------------------------------------------------------------------

@@ -440,8 +440,27 @@
 // analogous to EIGEN_CUDA_ARCH, but for HIP
 #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif
+
+// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+// value to 1024. The compiler assigns a default value of 256 when the attribute is not
+// specified. This results in failures on the HIP platform for cases where a GPU kernel
+// without an explicit launch_bounds attribute is called with a threads_per_block value
+// greater than 256.
+//
+// This is a regression in functionality and is expected to be fixed within the next
+// couple of ROCm releases (the compiler will go back to using 1024 as the default value).
+//
+// In the meantime, we will use an "only enabled for HIP" macro to set the launch_bounds
+// attribute.
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
 #endif
+
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
 
 // Unify CUDA/HIPCC
 #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
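
For context, a minimal sketch of the failure mode the new comment describes, using the plain HIP runtime API; the kernel and sizes are made up for illustration. On the affected ROCm releases a kernel compiled without an explicit launch_bounds attribute is assumed to use at most 256 threads per block, so a 1024-thread launch is expected to fail at run time.

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // No explicit launch_bounds: with ROCm 3.5+ the compiler assumes a 256-thread limit.
    __global__ void fill_kernel(float* out, float value, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = value;
    }

    int main() {
      const int n = 1 << 20;
      float* d_out = nullptr;
      hipMalloc(reinterpret_cast<void**>(&d_out), n * sizeof(float));

      // 1024 threads per block exceeds the assumed 256-thread limit, so on the affected
      // ROCm releases this launch is expected to fail (as described in the comment above).
      hipLaunchKernelGGL(fill_kernel, dim3((n + 1023) / 1024), dim3(1024), 0, 0, d_out, 1.0f, n);
      hipError_t err = hipGetLastError();
      if (err != hipSuccess) std::printf("launch failed: %s\n", hipGetErrorString(err));

      hipFree(d_out);
      return 0;
    }

Annotating the kernel with EIGEN_HIP_LAUNCH_BOUNDS_1024 (or an explicit __launch_bounds__(1024)) restores the 1024-thread limit on HIP without touching the CUDA build.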

--------------------------------------------------------------------------------

@@ -29,7 +29,7 @@ void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
 template<typename Kernel, typename Input, typename Output>
 __global__
-__launch_bounds__(1024)
+EIGEN_HIP_LAUNCH_BOUNDS_1024
 void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
 {
   int i = threadIdx.x + blockIdx.x*blockDim.x;

--------------------------------------------------------------------------------

@@ -578,7 +578,7 @@ struct GetKernelSize<Dynamic> {
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
@@ -630,7 +630,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
@@ -701,7 +701,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
 };
 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,

--------------------------------------------------------------------------------

@@ -15,7 +15,7 @@
 // so we'll use a macro to make clang happy.
 #ifndef KERNEL_FRIEND
 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
 #else
 #define KERNEL_FRIEND friend
 #endif
@@ -427,24 +427,24 @@ struct GenericReducer {
 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
 #if defined(EIGEN_HAS_GPU_FP16)
 template <typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
 template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 #endif
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 /**

--------------------------------------------------------------------------------

@@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 template <typename CoeffType, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
@@ -132,7 +132,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val,
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                             typename Self::CoeffReturnType* output, unsigned int* semaphore) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
@@ -214,7 +214,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, con
 #ifdef EIGEN_HAS_GPU_FP16
 template <typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                             packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
@@ -239,7 +239,7 @@ __global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Re
 template <typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -259,7 +259,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer red
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                             half* output, packet_traits<Eigen::half>::type* scratch) {
   typedef typename packet_traits<Eigen::half>::type PacketType;
   const int packet_width = unpacket_traits<PacketType>::size;
@@ -358,7 +358,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer red
 }
 template <typename Op>
-__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(threadIdx.x == 1);
   half2* pscratch = reinterpret_cast<half2*>(scratch);
   half tmp = __float2half(0.f);
@@ -476,7 +476,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                             typename Self::CoeffReturnType* output) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
@@ -561,7 +561,7 @@ __global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, co
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                             half* output) {
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -868,7 +868,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                             typename Self::CoeffReturnType* output) {
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

--------------------------------------------------------------------------------

@@ -334,7 +334,7 @@ struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
 // parallel, but it would be better to use a parallel scan algorithm and
 // optimize memory access.
 template <typename Self, typename Reducer>
-__global__ __launch_bounds__(1024) void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
   // Compute offset as in the CPU version
   Index val = threadIdx.x + blockIdx.x * blockDim.x;
   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();