From f6cc359e109fb04c6bb35da026f92da1dfe5c8f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20S=C3=A1nchez?=
Date: Fri, 3 Feb 2023 19:18:45 +0000
Subject: [PATCH] More EIGEN_DEVICE_FUNC fixes for CUDA 10/11/12.

---
 Eigen/src/Core/arch/GPU/PacketMath.h          |  4 +-
 Eigen/src/Core/util/DisableStupidWarnings.h   |  1 +
 test/gpu_basic.cu                             | 13 +++---
 .../CXX11/src/Tensor/TensorConvolution.h      | 42 +++++++++----------
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h  |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h  | 11 ++++-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h   |  4 +-
 .../CXX11/src/Tensor/TensorReductionGpu.h     | 42 ++++++++++++-------
 .../Eigen/CXX11/src/Tensor/TensorScan.h       |  2 +-
 9 files changed, 68 insertions(+), 53 deletions(-)

diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index e2bcf483a..17dd8fb51 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -379,7 +379,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -387,7 +387,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
 #endif
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 0865fb698..eed239769 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -121,6 +121,7 @@
   // The __device__ annotation seems to actually be needed in some cases,
   // otherwise resulting in kernel runtime errors.
   EIGEN_NV_DIAG_SUPPRESS(2886)
+  EIGEN_NV_DIAG_SUPPRESS(2929)
   EIGEN_NV_DIAG_SUPPRESS(2977)
   EIGEN_NV_DIAG_SUPPRESS(20012)
 #undef EIGEN_NV_DIAG_SUPPRESS
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index 00838ea8e..67f16bf9a 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -456,11 +456,10 @@ EIGEN_DECLARE_TEST(gpu_basic)
   // numeric_limits
   CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
 
-#if defined(__NVCC__)
-  // FIXME
-  // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
-  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
-  typedef Matrix<float,6,6> Matrix6f;
-  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
-#endif
+  // These tests require dynamic-sized matrix multiplication, which isn't currently
+  // supported on GPU.
+
+  // CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
+  // typedef Matrix<float,6,6> Matrix6f;
+  // CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index e6e586b7b..158d250f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -722,26 +722,26 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
 #endif
 
   // Load inputs to shared memory
-  const int first_x = blockIdx.x * maxX;
-  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
-  const int num_x_input = last_x - first_x + kernelSizeX;
+  const size_t first_x = blockIdx.x * maxX;
+  const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const size_t num_x_input = last_x - first_x + kernelSizeX;
 
-  const int first_y = blockIdx.y * maxY;
-  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
-  const int num_y_input = last_y - first_y + kernelSizeY;
+  const size_t first_y = blockIdx.y * maxY;
+  const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const size_t num_y_input = last_y - first_y + kernelSizeY;
 
-  const int first_z = blockIdx.z * maxZ;
-  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
-  const int num_z_input = last_z - first_z + kernelSizeZ;
+  const size_t first_z = blockIdx.z * maxZ;
+  const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const size_t num_z_input = last_z - first_z + kernelSizeZ;
 
   for (int p = 0; p < numPlanes; ++p) {
 
     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
-    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
-      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
-        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+    for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) {
           const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
@@ -751,18 +751,18 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     __syncthreads();
 
     // Convolution
-    const int num_z_output = last_z - first_z + 1;
-    const int num_y_output = last_y - first_y + 1;
-    const int num_x_output = last_x - first_x + 1;
+    const size_t num_z_output = last_z - first_z + 1;
+    const size_t num_y_output = last_y - first_y + 1;
+    const size_t num_x_output = last_x - first_x + 1;
     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
-    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
-      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
-        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+    for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) {
           float result = 0.0f;
-          for (int n = 0; n < kernelSizeZ; ++n) {
-            for (int m = 0; m < kernelSizeY; ++m) {
-              for (int l = 0; l < kernelSizeX; ++l) {
+          for (size_t n = 0; n < kernelSizeZ; ++n) {
+            for (size_t m = 0; m < kernelSizeY; ++m) {
+              for (size_t l = 0; l < kernelSizeX; ++l) {
                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
               }
             }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index 8ea1bf0d4..b47790704 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -378,7 +378,7 @@ struct GpuDevice {
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
   }
   EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
+    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
   }
   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 2bd94c308..f8e3f2981 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -65,7 +65,8 @@ struct TensorEvaluator
       TensorBlock;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  TensorEvaluator(const Derived& m, const Device& device)
       : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
        m_dims(m.dimensions()),
        m_device(device)
@@ -263,7 +264,8 @@ struct TensorEvaluator<const Derived, Device>
       TensorBlock;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  TensorEvaluator(const Derived& m, const Device& device)
      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
   { }
 
@@ -358,6 +360,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 {
   typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
@@ -455,6 +458,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
     RawAccess = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_device(device),
       m_functor(op.functor()),
@@ -571,6 +575,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType
     RawAccess = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_condImpl(op.ifExpression(), device),
      m_thenImpl(op.thenExpression(), device),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f961b4066..92d04f690 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -94,9 +94,8 @@ class TensorExecutor {
                  "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                  "EIGEN_USE_SYCL before including Eigen headers.");
 
-  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const Device& device = Device()) {
+                                      const Device& device = DefaultDevice()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
@@ -126,7 +125,6 @@ class TensorExecutor
-  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(
       const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 51cdf443f..600c2b0ff 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -52,7 +52,7 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
       return;
     }
     unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
@@ -65,6 +65,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
    gpu_assert(0 && "Wordsize not supported");
  }
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
+  EIGEN_UNUSED_VARIABLE(reducer);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -118,6 +121,8 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   atomicAdd(output, accum);
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -209,6 +214,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
 #endif
  }
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(semaphore);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -243,7 +253,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo
 
 template <typename Self,
           typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -715,11 +725,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] =
-              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
-          hr2[i] =
-              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+        for (int j = 0; j < packet_width / 2; j++) {
+          hr1[j] =
+              __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
+          hr2[j] =
+              __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
@@ -744,7 +754,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
+        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
      }
    }
  }
@@ -782,12 +792,12 @@ struct InnerReductionLauncher<
    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
+      const int dyn_blocks2 = divup(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2);
       LAUNCH_GPU_KERNEL((ReductionInitKernel),
-                        num_blocks, 1024, 0, device, reducer.initialize(),
+                        num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }
 
@@ -950,12 +960,12 @@ struct OuterReducer {
    if (num_blocks > 1) {
      // We initialize the outputs in the reduction kernel itself when we don't have to worry
      // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
+      const int dyn_blocks2 = divup(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2);
       LAUNCH_GPU_KERNEL((ReductionInitKernel),
-                        num_blocks, 1024, 0, device, reducer.initialize(),
+                        num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index ed0a731aa..2c574c79f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -191,7 +191,7 @@ template <typename Self, typename Reducer, typename Device,
           bool Vectorize = (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
                             internal::reducer_traits<Reducer, Device>::PacketAccess)>
 struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) const {
    Index total_size = internal::array_prod(self.dimensions());
 
    // We fix the index along the scan axis to 0 and perform a