Fix GPU build failures.

This commit is contained in:
Antonio Sanchez 2025-03-09 17:04:41 -07:00
parent 6a4a0b66bd
commit 952eda443b
7 changed files with 37 additions and 24 deletions

View File

@ -210,13 +210,13 @@ namespace half_impl {
// conversion steps back and forth. // conversion steps back and forth.
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
return __hadd(a, b); return __hadd(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b); return __hmul(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b); return __hsub(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
float num = __half2float(a); float num = __half2float(a);
@ -224,7 +224,7 @@ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
return __float2half(num / denom); return __float2half(num / denom);
} }
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a); return __hneg(static_cast<__half>(a));
} }
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b; a = a + b;
@ -243,22 +243,22 @@ EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
return a; return a;
} }
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b); return __heq(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b); return __hne(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b); return __hlt(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b); return __hle(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b); return __hgt(static_cast<__half>(a), static_cast<__half>(b));
} }
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b); return __hge(static_cast<__half>(a), static_cast<__half>(b));
} }
#else // Emulate support for half floats #else // Emulate support for half floats
@ -667,15 +667,15 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
#else // CUDA SDK < 9.0 #else // CUDA SDK < 9.0
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) { __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
return static_cast<Eigen::half>(__shfl(static_cast<float>(var), laneMask, width)); return static_cast<Eigen::half>(__shfl(static_cast<float>(var), srcLane, width));
} }
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) { __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), laneMask, width)); return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), delta, width));
} }
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) { __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), laneMask, width)); return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), delta, width));
} }
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) { __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {

View File

@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<> template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
return __ldg((const float4*)from); return __ldg(reinterpret_cast<const float4*>(from));
#else #else
return make_float4(from[0], from[1], from[2], from[3]); return make_float4(from[0], from[1], from[2], from[3]);
#endif #endif
@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
template<> template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
return __ldg((const double2*)from); return __ldg(reinterpret_cast<const double2*>(from));
#else #else
return make_double2(from[0], from[1]); return make_double2(from[0], from[1]);
#endif #endif

View File

@ -45,6 +45,7 @@
#include <list> #include <list>
#if __cplusplus >= 201103L #if __cplusplus >= 201103L
#include <random> #include <random>
#include <chrono>
#ifdef EIGEN_USE_THREADS #ifdef EIGEN_USE_THREADS
#include <future> #include <future>
#endif #endif

View File

@ -52,11 +52,14 @@
#endif #endif
#ifdef _WIN32 #ifdef _WIN32
#define NOMINMAX
#include <windows.h> #include <windows.h>
#elif defined(__APPLE__) #elif defined(__APPLE__)
#include <mach/mach_time.h> #include <mach/mach_time.h>
#include <unistd.h>
#else #else
#include <time.h> #include <time.h>
#include <unistd.h>
#endif #endif
#ifdef EIGEN_USE_THREADS #ifdef EIGEN_USE_THREADS
@ -68,7 +71,8 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#if __cplusplus >= 201103L #if __cplusplus >= 201103L
#include <atomic> #include <atomic>
#include <unistd.h> #include <chrono>
#include <thread>
#endif #endif
#endif #endif

View File

@ -700,7 +700,7 @@ __global__ void EigenConvolutionKernel3D(
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ; const int num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) { for (size_t p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0; const int plane_kernel_offset = 0;
@ -726,9 +726,9 @@ __global__ void EigenConvolutionKernel3D(
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f; float result = 0.0f;
for (int n = 0; n < kernelSizeZ; ++n) { for (size_t n = 0; n < kernelSizeZ; ++n) {
for (int m = 0; m < kernelSizeY; ++m) { for (size_t m = 0; m < kernelSizeY; ++m) {
for (int l = 0; l < kernelSizeX; ++l) { for (size_t l = 0; l < kernelSizeX; ++l) {
result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
} }
} }

View File

@ -38,7 +38,7 @@ class StreamInterface {
}; };
static cudaDeviceProp* m_deviceProperties; static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false; static volatile bool m_devicePropInitialized = false;
static void initializeDeviceProp() { static void initializeDeviceProp() {
if (!m_devicePropInitialized) { if (!m_devicePropInitialized) {
@ -87,8 +87,12 @@ static void initializeDeviceProp() {
while (!m_devicePropInitialized) { while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L #if __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_acquire); std::atomic_thread_fence(std::memory_order_acquire);
#endif std::this_thread::sleep_for(std::chrono::milliseconds(1000));
#elif defined(_WIN32)
Sleep(1);
#else
sleep(1); sleep(1);
#endif
} }
} }
} }
@ -214,9 +218,12 @@ struct GpuDevice {
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
stream_->stream()); stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_ONLY_USED_FOR_DEBUG(err);
assert(err == cudaSuccess); assert(err == cudaSuccess);
#else #else
EIGEN_UNUSED_VARIABLE(dst);
EIGEN_UNUSED_VARIABLE(src);
EIGEN_UNUSED_VARIABLE(n);
eigen_assert(false && "The default device should be used instead to generate kernel code"); eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif #endif
} }

View File

@ -169,6 +169,7 @@ template <typename T> class array<T, 0> {
#if EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
EIGEN_ONLY_USED_FOR_DEBUG(l);
eigen_assert(l.size() == 0); eigen_assert(l.size() == 0);
} }
#endif #endif