Fix GPU build failures.
This commit is contained in:
parent 6a4a0b66bd
commit 952eda443b
@@ -210,13 +210,13 @@ namespace half_impl {
 // conversion steps back and forth.
 
 EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-  return __hadd(a, b);
+  return __hadd(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
+  return __hmul(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
+  return __hsub(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
   float num = __half2float(a);
@@ -224,7 +224,7 @@ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
   return __float2half(num / denom);
 }
 EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
-  return __hneg(a);
+  return __hneg(static_cast<__half>(a));
 }
 EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
   a = a + b;
@@ -243,22 +243,22 @@ EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
   return a;
 }
 EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
-  return __heq(a, b);
+  return __heq(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
-  return __hne(a, b);
+  return __hne(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
-  return __hlt(a, b);
+  return __hlt(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
-  return __hle(a, b);
+  return __hle(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
-  return __hgt(a, b);
+  return __hgt(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
-  return __hge(a, b);
+  return __hge(static_cast<__half>(a), static_cast<__half>(b));
 }
 
 #else // Emulate support for half floats
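Note: the explicit casts above are presumably needed because newer CUDA toolkits give __half its own operators and conversions, so passing Eigen's half wrapper straight to intrinsics such as __hadd can become ambiguous. A minimal sketch of the pattern, assuming a wrapper type like the following (not Eigen's actual definition):

#include <cuda_fp16.h>

// Minimal sketch: a wrapper that, like Eigen::half, is convertible to the
// native __half type.
struct my_half {
  __half x;
  __device__ operator __half() const { return x; }
};

// Casting first pins down the intended intrinsic overload.
// (__hadd itself requires compute capability 5.3 or higher.)
__device__ my_half add_halves(const my_half& a, const my_half& b) {
  return my_half{ __hadd(static_cast<__half>(a), static_cast<__half>(b)) };
}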
@@ -667,15 +667,15 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
 #else // CUDA SDK < 9.0
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
-  return static_cast<Eigen::half>(__shfl(static_cast<float>(var), laneMask, width));
+  return static_cast<Eigen::half>(__shfl(static_cast<float>(var), srcLane, width));
 }
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
-  return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), laneMask, width));
+  return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), delta, width));
 }
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
-  return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), laneMask, width));
+  return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), delta, width));
 }
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
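Note: the pre-9.0 fallback bodies referred to laneMask, the parameter of the __shfl_xor wrapper, instead of each function's own srcLane/delta parameter, so this branch did not compile. A hedged sketch of how such wrappers are typically used, assuming the pre-9.0 intrinsics this #else branch targets (the helper name is illustrative, not part of Eigen):

#include <cuda_fp16.h>

// Warp-wide sum of half values via repeated __shfl_down.
__device__ float warp_sum_half(__half v) {
  float acc = __half2float(v);
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Pull the running sum from the lane `offset` positions above this one.
    acc += __shfl_down(acc, offset);  // __shfl_down_sync(mask, ...) on CUDA 9+
  }
  return acc;  // lane 0 now holds the sum over the full warp
}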
@@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
 #endif
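Note: these two hunks only change the cast style. __ldg is the sm_35+ read-only-cache load, and reinterpret_cast makes the pointer reinterpretation explicit where the old code used a C-style cast. A hedged usage sketch with my own names (not Eigen code), assuming the input pointer is 16-byte aligned as the Aligned path requires:

__global__ void scale4(const float* __restrict__ in, float* __restrict__ out,
                       float s, int n) {
  const int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (i + 3 >= n) return;                 // guard the vectorized tail
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  const float4 v = __ldg(reinterpret_cast<const float4*>(in + i));  // read-only cache
#else
  const float4 v = make_float4(in[i], in[i + 1], in[i + 2], in[i + 3]);
#endif
  out[i] = s * v.x; out[i + 1] = s * v.y;
  out[i + 2] = s * v.z; out[i + 3] = s * v.w;
}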
@@ -45,6 +45,7 @@
 #include <list>
 #if __cplusplus >= 201103L
 #include <random>
+#include <chrono>
 #ifdef EIGEN_USE_THREADS
 #include <future>
 #endif
@@ -52,11 +52,14 @@
 #endif
 
 #ifdef _WIN32
+#define NOMINMAX
 #include <windows.h>
 #elif defined(__APPLE__)
 #include <mach/mach_time.h>
+#include <unistd.h>
 #else
 #include <time.h>
+#include <unistd.h>
 #endif
 
 #ifdef EIGEN_USE_THREADS
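Note: NOMINMAX must be defined before <windows.h>, which otherwise defines min and max as macros that collide with std::min/std::max; the extra <unistd.h> includes supply sleep() on the POSIX branches. A small illustration of the macro clash, using a hypothetical helper rather than anything from the test harness:

// Windows-only code path shown for illustration.
#ifdef _WIN32
#define NOMINMAX          // must precede <windows.h>
#include <windows.h>
#endif
#include <algorithm>

int smaller(int a, int b) {
  // Without NOMINMAX, <windows.h> turns min/max into macros and this
  // ordinary call no longer compiles as written.
  return std::min(a, b);
}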
@@ -68,7 +71,8 @@
 #include <cuda_runtime.h>
 #if __cplusplus >= 201103L
 #include <atomic>
-#include <unistd.h>
+#include <chrono>
+#include <thread>
 #endif
 #endif
 
@@ -700,7 +700,7 @@ __global__ void EigenConvolutionKernel3D(
   const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
   const int num_z_input = last_z - first_z + kernelSizeZ;
 
-  for (int p = 0; p < numPlanes; ++p) {
+  for (size_t p = 0; p < numPlanes; ++p) {
 
     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
@@ -726,9 +726,9 @@ __global__ void EigenConvolutionKernel3D(
     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
       for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
         float result = 0.0f;
-        for (int n = 0; n < kernelSizeZ; ++n) {
-          for (int m = 0; m < kernelSizeY; ++m) {
-            for (int l = 0; l < kernelSizeX; ++l) {
+        for (size_t n = 0; n < kernelSizeZ; ++n) {
+          for (size_t m = 0; m < kernelSizeY; ++m) {
+            for (size_t l = 0; l < kernelSizeX; ++l) {
               result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
             }
           }
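Note: the loop counters switch to size_t presumably because the bounds (numPlanes and the kernel sizes) are unsigned, and comparing a signed int against them trips -Wsign-compare, which is fatal when warnings are promoted to errors. A generic illustration of the pattern, not Eigen code:

#include <cstddef>

// An int counter compared against a size_t bound triggers -Wsign-compare;
// a size_t counter side-steps it.
float sum_first(const float* data, std::size_t count) {
  float acc = 0.0f;
  for (std::size_t i = 0; i < count; ++i) {
    acc += data[i];
  }
  return acc;
}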
@@ -38,7 +38,7 @@ class StreamInterface {
 };
 
 static cudaDeviceProp* m_deviceProperties;
-static bool m_devicePropInitialized = false;
+static volatile bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
@@ -87,8 +87,12 @@ static void initializeDeviceProp() {
     while (!m_devicePropInitialized) {
 #if __cplusplus >= 201103L
       std::atomic_thread_fence(std::memory_order_acquire);
-#endif
+      std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+#elif defined(_WIN32)
+      Sleep(1);
+#else
       sleep(1);
+#endif
     }
   }
 }
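Note: this wait loop is the reader side of a one-time initialization. The flag is now volatile so the compiler cannot hoist the load out of the loop, the acquire fence pairs with the writer's publication of the device properties, and the sleep gains Windows and pre-C++11 fallbacks (hence the <chrono>/<thread> includes added earlier). A sketch of the same initialize-once idea written directly against std::atomic, assuming C++11 is available (names are illustrative, not Eigen's):

#include <atomic>
#include <chrono>
#include <thread>

static std::atomic<bool> g_init_claimed(false);
static std::atomic<bool> g_init_done(false);

void initialize_once() {
  if (!g_init_claimed.exchange(true)) {   // exactly one caller sees false
    // ... expensive one-time setup would run here ...
    g_init_done.store(true, std::memory_order_release);  // publish results
  } else {
    while (!g_init_done.load(std::memory_order_acquire)) {
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
  }
}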
@@ -214,10 +218,13 @@ struct GpuDevice {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                       stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
+    EIGEN_ONLY_USED_FOR_DEBUG(err);
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
@@ -169,6 +169,7 @@ template <typename T> class array<T, 0> {
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_ONLY_USED_FOR_DEBUG(l);
     eigen_assert(l.size() == 0);
   }
 #endif
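Note: this hunk and the GpuDevice one above both deal with variables that are only consumed by assert/eigen_assert, which compile to nothing under NDEBUG and would then leave err, dst, src, n, and l flagged as unused. A sketch of how such a macro can work; the macro name and exact definition here are illustrative, not Eigen's:

#include <cassert>

// Casting to void "uses" the variable so release builds, where assert()
// expands to nothing, do not warn about an unused variable.
#define MY_ONLY_USED_FOR_DEBUG(x) ((void)(x))

int checked_div(int a, int b) {
  const int remainder = a % b;
  MY_ONLY_USED_FOR_DEBUG(remainder);  // keeps -Wunused-variable quiet under NDEBUG
  assert(remainder == 0 && "checked_div expects exact division");
  return a / b;
}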