Fix GPU build failures.
This commit is contained in:
parent 6a4a0b66bd
commit 952eda443b
@@ -210,13 +210,13 @@ namespace half_impl {
 // conversion steps back and forth.
 
 EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-  return __hadd(a, b);
+  return __hadd(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
+  return __hmul(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
+  return __hsub(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
   float num = __half2float(a);
@@ -224,7 +224,7 @@ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
   return __float2half(num / denom);
 }
 EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
-  return __hneg(a);
+  return __hneg(static_cast<__half>(a));
 }
 EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
   a = a + b;
@@ -243,22 +243,22 @@ EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
   return a;
 }
 EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
-  return __heq(a, b);
+  return __heq(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
-  return __hne(a, b);
+  return __hne(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
-  return __hlt(a, b);
+  return __hlt(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
-  return __hle(a, b);
+  return __hle(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
-  return __hgt(a, b);
+  return __hgt(static_cast<__half>(a), static_cast<__half>(b));
 }
 EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
-  return __hge(a, b);
+  return __hge(static_cast<__half>(a), static_cast<__half>(b));
 }
 
 #else // Emulate support for half floats
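Note: the explicit casts above are presumably needed because newer CUDA toolkits give __half its own operators and conversions, so passing Eigen's half wrapper straight to intrinsics such as __hadd can become ambiguous. A minimal sketch of the pattern, assuming a wrapper type like the following (not Eigen's actual definition):

#include <cuda_fp16.h>

// Minimal sketch: a wrapper that, like Eigen::half, is convertible to the
// native __half type.
struct my_half {
  __half x;
  __device__ operator __half() const { return x; }
};

// Casting first pins down the intended intrinsic overload.
// (__hadd itself requires compute capability 5.3 or higher.)
__device__ my_half add_halves(const my_half& a, const my_half& b) {
  return my_half{ __hadd(static_cast<__half>(a), static_cast<__half>(b)) };
}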
@@ -667,15 +667,15 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
 #else // CUDA SDK < 9.0
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
-  return static_cast<Eigen::half>(__shfl(static_cast<float>(var), laneMask, width));
+  return static_cast<Eigen::half>(__shfl(static_cast<float>(var), srcLane, width));
 }
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
-  return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), laneMask, width));
+  return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), delta, width));
 }
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
-  return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), laneMask, width));
+  return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), delta, width));
 }
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
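Note: the pre-9.0 fallback bodies referred to laneMask, the parameter of the __shfl_xor wrapper, instead of each function's own srcLane/delta parameter, so this branch did not compile. A hedged sketch of how such wrappers are typically used, assuming the pre-9.0 intrinsics this #else branch targets (the helper name is illustrative, not part of Eigen):

#include <cuda_fp16.h>

// Warp-wide sum of half values via repeated __shfl_down.
__device__ float warp_sum_half(__half v) {
  float acc = __half2float(v);
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Pull the running sum from the lane `offset` positions above this one.
    acc += __shfl_down(acc, offset);  // __shfl_down_sync(mask, ...) on CUDA 9+
  }
  return acc;  // lane 0 now holds the sum over the full warp
}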
@@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
 #endif
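Note: these two hunks only change the cast style. __ldg is the sm_35+ read-only-cache load, and reinterpret_cast makes the pointer reinterpretation explicit where the old code used a C-style cast. A hedged usage sketch with my own names (not Eigen code), assuming the input pointer is 16-byte aligned as the Aligned path requires:

__global__ void scale4(const float* __restrict__ in, float* __restrict__ out,
                       float s, int n) {
  const int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (i + 3 >= n) return;                 // guard the vectorized tail
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  const float4 v = __ldg(reinterpret_cast<const float4*>(in + i));  // read-only cache
#else
  const float4 v = make_float4(in[i], in[i + 1], in[i + 2], in[i + 3]);
#endif
  out[i] = s * v.x; out[i + 1] = s * v.y;
  out[i + 2] = s * v.z; out[i + 3] = s * v.w;
}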
@@ -45,6 +45,7 @@
 #include <list>
 #if __cplusplus >= 201103L
 #include <random>
+#include <chrono>
 #ifdef EIGEN_USE_THREADS
 #include <future>
 #endif
@@ -52,11 +52,14 @@
 #endif
 
 #ifdef _WIN32
+#define NOMINMAX
 #include <windows.h>
 #elif defined(__APPLE__)
 #include <mach/mach_time.h>
+#include <unistd.h>
 #else
 #include <time.h>
+#include <unistd.h>
 #endif
 
 #ifdef EIGEN_USE_THREADS
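Note: NOMINMAX must be defined before <windows.h>, which otherwise defines min and max as macros that collide with std::min/std::max; the extra <unistd.h> includes supply sleep() on the POSIX branches. A small illustration of the macro clash, using a hypothetical helper rather than anything from the test harness:

// Windows-only code path shown for illustration.
#ifdef _WIN32
#define NOMINMAX          // must precede <windows.h>
#include <windows.h>
#endif
#include <algorithm>

int smaller(int a, int b) {
  // Without NOMINMAX, <windows.h> turns min/max into macros and this
  // ordinary call no longer compiles as written.
  return std::min(a, b);
}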
@@ -68,7 +71,8 @@
 #include <cuda_runtime.h>
 #if __cplusplus >= 201103L
 #include <atomic>
-#include <unistd.h>
+#include <chrono>
+#include <thread>
 #endif
 #endif
 
@@ -700,7 +700,7 @@ __global__ void EigenConvolutionKernel3D(
   const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
   const int num_z_input = last_z - first_z + kernelSizeZ;
 
-  for (int p = 0; p < numPlanes; ++p) {
+  for (size_t p = 0; p < numPlanes; ++p) {
 
     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
@@ -726,9 +726,9 @@ __global__ void EigenConvolutionKernel3D(
     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
       for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
         float result = 0.0f;
-        for (int n = 0; n < kernelSizeZ; ++n) {
-          for (int m = 0; m < kernelSizeY; ++m) {
-            for (int l = 0; l < kernelSizeX; ++l) {
+        for (size_t n = 0; n < kernelSizeZ; ++n) {
+          for (size_t m = 0; m < kernelSizeY; ++m) {
+            for (size_t l = 0; l < kernelSizeX; ++l) {
               result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
             }
           }
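Note: the loop counters switch to size_t presumably because the bounds (numPlanes and the kernel sizes) are unsigned, and comparing a signed int against them trips -Wsign-compare, which is fatal when warnings are promoted to errors. A generic illustration of the pattern, not Eigen code:

#include <cstddef>

// An int counter compared against a size_t bound triggers -Wsign-compare;
// a size_t counter side-steps it.
float sum_first(const float* data, std::size_t count) {
  float acc = 0.0f;
  for (std::size_t i = 0; i < count; ++i) {
    acc += data[i];
  }
  return acc;
}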
@@ -38,7 +38,7 @@ class StreamInterface {
 };
 
 static cudaDeviceProp* m_deviceProperties;
-static bool m_devicePropInitialized = false;
+static volatile bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
@@ -87,8 +87,12 @@ static void initializeDeviceProp() {
     while (!m_devicePropInitialized) {
 #if __cplusplus >= 201103L
       std::atomic_thread_fence(std::memory_order_acquire);
-#endif
+      std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+#elif defined(_WIN32)
+      Sleep(1);
+#else
       sleep(1);
+#endif
     }
   }
 }
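Note: this wait loop is the reader side of a one-time initialization. The flag is now volatile so the compiler cannot hoist the load out of the loop, the acquire fence pairs with the writer's publication of the device properties, and the sleep gains Windows and pre-C++11 fallbacks (hence the <chrono>/<thread> includes added earlier). A sketch of the same initialize-once idea written directly against std::atomic, assuming C++11 is available (names are illustrative, not Eigen's):

#include <atomic>
#include <chrono>
#include <thread>

static std::atomic<bool> g_init_claimed(false);
static std::atomic<bool> g_init_done(false);

void initialize_once() {
  if (!g_init_claimed.exchange(true)) {   // exactly one caller sees false
    // ... expensive one-time setup would run here ...
    g_init_done.store(true, std::memory_order_release);  // publish results
  } else {
    while (!g_init_done.load(std::memory_order_acquire)) {
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
  }
}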
@@ -214,10 +218,13 @@ struct GpuDevice {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                       stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
+    EIGEN_ONLY_USED_FOR_DEBUG(err);
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
@@ -169,6 +169,7 @@ template <typename T> class array<T, 0> {
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_ONLY_USED_FOR_DEBUG(l);
     eigen_assert(l.size() == 0);
   }
 #endif
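Note: this hunk and the GpuDevice one above both deal with variables that are only consumed by assert/eigen_assert, which compile to nothing under NDEBUG and would then leave err, dst, src, n, and l flagged as unused. A sketch of how such a macro can work; the macro name and exact definition here are illustrative, not Eigen's:

#include <cassert>

// Casting to void "uses" the variable so release builds, where assert()
// expands to nothing, do not warn about an unused variable.
#define MY_ONLY_USED_FOR_DEBUG(x) ((void)(x))

int checked_div(int a, int b) {
  const int remainder = a % b;
  MY_ONLY_USED_FOR_DEBUG(remainder);  // keeps -Wunused-variable quiet under NDEBUG
  assert(remainder == 0 && "checked_div expects exact division");
  return a / b;
}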