From 952eda443bc700a04eef438ef0d278ed0a8366a5 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Sun, 9 Mar 2025 17:04:41 -0700 Subject: [PATCH] Fix GPU build failures. --- Eigen/src/Core/arch/CUDA/Half.h | 26 +++++++++---------- Eigen/src/Core/arch/CUDA/PacketMath.h | 4 +-- test/main.h | 1 + unsupported/Eigen/CXX11/Tensor | 6 ++++- .../CXX11/src/Tensor/TensorConvolution.h | 8 +++--- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 15 ++++++++--- .../Eigen/CXX11/src/util/EmulateArray.h | 1 + 7 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index bee0e613e..aaa97a13b 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -210,13 +210,13 @@ namespace half_impl { // conversion steps back and forth. EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { - return __hadd(a, b); + return __hadd(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); + return __hmul(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); + return __hsub(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { float num = __half2float(a); @@ -224,7 +224,7 @@ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { return __float2half(num / denom); } EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { - return __hneg(a); + return __hneg(static_cast<__half>(a)); } EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { a = a + b; @@ -243,22 +243,22 @@ EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { return a; } EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { - return __heq(a, 
b); + return __heq(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { - return __hne(a, b); + return __hne(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { - return __hlt(a, b); + return __hlt(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { - return __hle(a, b); + return __hle(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { - return __hgt(a, b); + return __hgt(static_cast<__half>(a), static_cast<__half>(b)); } EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { - return __hge(a, b); + return __hge(static_cast<__half>(a), static_cast<__half>(b)); } #else // Emulate support for half floats @@ -667,15 +667,15 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen: #else // CUDA SDK < 9.0 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) { - return static_cast(__shfl(static_cast(var), laneMask, width)); + return static_cast(__shfl(static_cast(var), srcLane, width)); } __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) { - return static_cast(__shfl_up(static_cast(var), laneMask, width)); + return static_cast(__shfl_up(static_cast(var), delta, width)); } __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) { - return static_cast(__shfl_down(static_cast(var), laneMask, width)); + return static_cast(__shfl_down(static_cast(var), delta, width)); } __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) { diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h 
b/Eigen/src/Core/arch/CUDA/PacketMath.h index 084533499..1c24aab2a 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const float4*)from); + return __ldg(reinterpret_cast<const float4*>(from)); #else return make_float4(from[0], from[1], from[2], from[3]); #endif @@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const double2*)from); + return __ldg(reinterpret_cast<const double2*>(from)); #else return make_double2(from[0], from[1]); #endif diff --git a/test/main.h b/test/main.h index 18bb5c825..850d44c6a 100644 --- a/test/main.h +++ b/test/main.h @@ -45,6 +45,7 @@ #include <list> #if __cplusplus >= 201103L #include <random> +#include <chrono> #ifdef EIGEN_USE_THREADS #include <future> #endif diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 92e49f583..fdab7c58a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -52,11 +52,14 @@ #endif #ifdef _WIN32 +#define NOMINMAX #include <windows.h> #elif defined(__APPLE__) #include <mach/mach_time.h> +#include <unistd.h> #else #include <time.h> +#include <unistd.h> #endif #ifdef EIGEN_USE_THREADS @@ -68,7 +71,8 @@ #include <cuda_runtime.h> #if __cplusplus >= 201103L #include <atomic> -#include <unistd.h> +#include <thread> +#include <chrono> #endif #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 47089e75a..81cbd4f01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -700,7 +700,7 @@ __global__ void EigenConvolutionKernel3D( const int last_z = (first_z + maxZ < numZ ?
first_z + maxZ : numZ) - 1; const int num_z_input = last_z - first_z + kernelSizeZ; - for (int p = 0; p < numPlanes; ++p) { + for (size_t p = 0; p < numPlanes; ++p) { const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; @@ -726,9 +726,9 @@ __global__ void EigenConvolutionKernel3D( for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { + for (size_t n = 0; n < kernelSizeZ; ++n) { + for (size_t m = 0; m < kernelSizeY; ++m) { + for (size_t l = 0; l < kernelSizeX; ++l) { result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4f5767bc7..51cd7fcde 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -38,7 +38,7 @@ class StreamInterface { }; static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; +static volatile bool m_devicePropInitialized = false; static void initializeDeviceProp() { if (!m_devicePropInitialized) { @@ -87,8 +87,12 @@ static void initializeDeviceProp() { while (!m_devicePropInitialized) { #if __cplusplus >= 201103L std::atomic_thread_fence(std::memory_order_acquire); -#endif + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); +#elif defined(_WIN32) + Sleep(1); +#else sleep(1); +#endif } } } @@ -214,10 +218,13 @@ struct GpuDevice { #ifndef __CUDA_ARCH__ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) + EIGEN_ONLY_USED_FOR_DEBUG(err); assert(err == cudaSuccess); #else - 
eigen_assert(false && "The default device should be used instead to generate kernel code"); + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 30d3ebcff..94ae91720 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -169,6 +169,7 @@ template <typename T> class array<T, 0> { #if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { + EIGEN_ONLY_USED_FOR_DEBUG(l); eigen_assert(l.size() == 0); } #endif