Mirror of https://gitlab.com/libeigen/eigen.git
Make Eigen build with cuda 10 and clang.
commit 734a50dc60
parent c8d8d5c0fc
@@ -239,13 +239,17 @@ namespace Eigen {
 namespace half_impl {
 
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \
+  (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
+#define __EIGEN_NATIVE_FP16
+#endif
 
 // Intrinsics for native fp16 support. Note that on current hardware,
 // these are no faster than fp32 arithmetic (you need to use the half2
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
 
+#if defined(__EIGEN_NATIVE_FP16)
 EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
 #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
   return __hadd(::__half(a), ::__half(b));
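
Note (illustrative, not part of the diff): the comment in the hunk above refers to the packed half2 intrinsics from cuda_fp16.h. A minimal CUDA sketch of the distinction, assuming a device of compute capability 5.3 or newer; the kernel and its names are hypothetical:

  #include <cuda_fp16.h>

  // __hadd adds one fp16 value per instruction; __hadd2 adds a packed pair,
  // which is where the half2 ALU speedup mentioned above comes from.
  __global__ void add_fp16(const __half2* a, const __half2* b, __half2* out, int n2) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) {
      out[i] = __hadd2(a[i], b[i]);  // two fp16 additions in one instruction
    }
  }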
@@ -306,7 +310,20 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
   return __hge(a, b);
 }
 
-#else // Emulate support for half floats
+#endif
+
+#if !defined(__EIGEN_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats
+
+#if defined(__clang__) && defined(__CUDA__)
+// We need to provide emulated *host-side* FP16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_FP16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
 
 // Definitions for CPUs and older HIP+CUDA, mostly working through conversion
 // to/from fp32.
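
The push_macro idiom introduced above (with the matching pop_macro in the next hunk) temporarily redefines EIGEN_DEVICE_FUNC for the emulated operators and restores the saved definition afterwards. A minimal standalone sketch of the pattern, using hypothetical names rather than Eigen's:

  #define WIDGET_API inline            // regular definition used elsewhere

  #pragma push_macro("WIDGET_API")     // save the current definition
  #undef WIDGET_API
  #define WIDGET_API                   // temporary definition for this region

  WIDGET_API int host_only_helper(int x) { return x + 1; }

  #pragma pop_macro("WIDGET_API")      // restore the saved definition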
@@ -363,6 +380,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
   return float(a) >= float(b);
 }
 
+#if defined(__clang__) && defined(__CUDA__)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
 #endif // Emulate support for half floats
 
 // Division by an index. Do it in full float precision to avoid accuracy
@@ -16,7 +16,8 @@ namespace internal {
 
 // Most of the following operations require arch >= 3.0
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \
+    (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
 
 template<> struct is_arithmetic<half2> { enum { value = true }; };
 
@@ -45,7 +46,14 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if !defined(EIGEN_CUDA_ARCH)
+  half2 r;
+  r.x = from;
+  r.y = from;
+  return r;
+#else
   return __half2half2(from);
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
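
The new !defined(EIGEN_CUDA_ARCH) branch above fills the two half2 lanes by hand, so the broadcast compiles even when no device architecture is being targeted and the code does not have to rely on the __half2half2 device intrinsic. A hedged standalone sketch of the same idea, using only cuda_fp16.h types and a hypothetical helper name:

  #include <cuda_fp16.h>

  __host__ __device__ inline __half2 broadcast_half(__half v) {
  #if defined(__CUDA_ARCH__)
    return __half2half2(v);   // device path: single intrinsic
  #else
    __half2 r;                // host path: fill both lanes by hand
    r.x = v;
    r.y = v;
    return r;
  #endif
  }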
@@ -395,11 +395,8 @@
 #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
 
-// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_CUDACC_VER __CUDACC_VER__
+#if defined(CUDA_VERSION)
+#define EIGEN_CUDACC_VER (CUDA_VERSION*10)
 #else
 #define EIGEN_CUDACC_VER 0
 #endif
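
A short worked example for the EIGEN_CUDACC_VER change above (the CUDA_VERSION encoding is taken from cuda.h and is an assumption of this note, not part of the diff): cuda.h defines CUDA_VERSION as 1000*major + 10*minor, so CUDA_VERSION*10 lands on the same scale as the old major*10000 + minor*100 value, and existing ">= 90000" comparisons keep their meaning for any compiler that sees cuda.h.

  // Hedged arithmetic check, e.g. in a scratch translation unit:
  static_assert(10000 * 10 == 10 * 10000 + 0 * 100, "CUDA 10.0 -> 100000");
  static_assert( 9020 * 10 ==  9 * 10000 + 2 * 100, "CUDA 9.2  ->  90200");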
@@ -674,10 +674,6 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
-                             device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
       LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                          1, 1, 0, device, reducer, self, num_preserved_vals, output);
     }