From 952eda443bc700a04eef438ef0d278ed0a8366a5 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Sun, 9 Mar 2025 17:04:41 -0700 Subject: [PATCH] Fix GPU build failures. --- Eigen/src/Core/arch/CUDA/Half.h | 26 +++++++++---------- Eigen/src/Core/arch/CUDA/PacketMath.h | 4 +-- test/main.h | 1 + unsupported/Eigen/CXX11/Tensor | 6 ++++- .../CXX11/src/Tensor/TensorConvolution.h | 8 +++--- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 15 ++++++++--- .../Eigen/CXX11/src/util/EmulateArray.h | 1 + 7 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index bee0e613e..aaa97a13b 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -210,13 +210,13 @@ namespace half_impl { // conversion steps back and forth. EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { - return __hadd(a, b); + return __hadd(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); + return __hmul(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); + return __hsub(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { float num = __half2float(a); @@ -224,7 +224,7 @@ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { return __float2half(num / denom); } EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { - return __hneg(a); + return __hneg(static_cast<__half>(a)); } EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { a = a + b; @@ -243,22 +243,22 @@ EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { return a; } EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { - return __heq(a, 
b); + return __heq(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { - return __hne(a, b); + return __hne(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { - return __hlt(a, b); + return __hlt(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { - return __hle(a, b); + return __hle(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { - return __hgt(a, b); + return __hgt(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { - return __hge(a, b); + return __hge(static_cast<__half>(a), static_cast<__half>(b)); } #else // Emulate support for half floats @@ -667,15 +667,15 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen: #else // CUDA SDK < 9.0 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) { - return static_cast(__shfl(static_cast(var), laneMask, width)); + return static_cast(__shfl(static_cast(var), srcLane, width)); } __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) { - return static_cast(__shfl_up(static_cast(var), laneMask, width)); + return static_cast(__shfl_up(static_cast(var), delta, width)); } __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) { - return static_cast(__shfl_down(static_cast(var), laneMask, width)); + return static_cast(__shfl_down(static_cast(var), delta, width)); } __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) { diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h 
b/Eigen/src/Core/arch/CUDA/PacketMath.h index 084533499..1c24aab2a 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const float4*)from); + return __ldg(reinterpret_cast<const float4*>(from)); #else return make_float4(from[0], from[1], from[2], from[3]); #endif @@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const double2*)from); + return __ldg(reinterpret_cast<const double2*>(from)); #else return make_double2(from[0], from[1]); #endif diff --git a/test/main.h b/test/main.h index 18bb5c825..850d44c6a 100644 --- a/test/main.h +++ b/test/main.h @@ -45,6 +45,7 @@ #include <list> #if __cplusplus >= 201103L #include <random> +#include <chrono> #ifdef EIGEN_USE_THREADS #include <future> #endif diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 92e49f583..fdab7c58a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -52,11 +52,14 @@ #endif #ifdef _WIN32 +#define NOMINMAX #include <windows.h> #elif defined(__APPLE__) #include <mach/mach_time.h> +#include <unistd.h> #else #include <time.h> +#include <unistd.h> #endif #ifdef EIGEN_USE_THREADS @@ -68,7 +71,8 @@ #include <cuda_runtime.h> #if __cplusplus >= 201103L #include <atomic> -#include <unistd.h> +#include <thread> +#include <chrono> #endif #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 47089e75a..81cbd4f01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -700,7 +700,7 @@ __global__ void EigenConvolutionKernel3D( const int last_z = (first_z + maxZ < numZ ?
first_z + maxZ : numZ) - 1; const int num_z_input = last_z - first_z + kernelSizeZ; - for (int p = 0; p < numPlanes; ++p) { + for (size_t p = 0; p < numPlanes; ++p) { const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; @@ -726,9 +726,9 @@ __global__ void EigenConvolutionKernel3D( for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { + for (size_t n = 0; n < kernelSizeZ; ++n) { + for (size_t m = 0; m < kernelSizeY; ++m) { + for (size_t l = 0; l < kernelSizeX; ++l) { result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4f5767bc7..51cd7fcde 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -38,7 +38,7 @@ class StreamInterface { }; static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; +static volatile bool m_devicePropInitialized = false; static void initializeDeviceProp() { if (!m_devicePropInitialized) { @@ -87,8 +87,12 @@ static void initializeDeviceProp() { while (!m_devicePropInitialized) { #if __cplusplus >= 201103L std::atomic_thread_fence(std::memory_order_acquire); -#endif + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); +#elif defined(_WIN32) + Sleep(1); +#else sleep(1); +#endif } } } @@ -214,10 +218,13 @@ struct GpuDevice { #ifndef __CUDA_ARCH__ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) + EIGEN_ONLY_USED_FOR_DEBUG(err); assert(err == cudaSuccess); #else - 
eigen_assert(false && "The default device should be used instead to generate kernel code"); + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 30d3ebcff..94ae91720 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -169,6 +169,7 @@ template <typename T> class array<T, 0> { #if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { + EIGEN_ONLY_USED_FOR_DEBUG(l); eigen_assert(l.size() == 0); } #endif