Mirror of https://gitlab.com/libeigen/eigen.git
Disabled the use of half2 on CUDA devices of compute capability < 5.3
commit 995f202cea
parent 8d22967bd9
@@ -17,7 +17,8 @@
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+// Most of the following operations require arch >= 5.3
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 
 namespace Eigen {
 namespace internal {
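Note: compute capability 5.3 is where CUDA's native fp16 arithmetic intrinsics (__hadd and friends) first appear; on earlier devices half is a storage format and arithmetic has to round-trip through float. A minimal sketch of the guard pattern this commit standardizes on, illustration only, not code from the commit:

    #include <cuda_fp16.h>

    // Native half2 add on sm_53+, float fallback elsewhere (sketch).
    __device__ half2 add_lanes(half2 a, half2 b) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
      return __hadd2(a, b);                         // hardware fp16 add
    #else
      float2 fa = __half22float2(a);                // widen both lanes
      float2 fb = __half22float2(b);
      return __floats2half2_rn(fa.x + fb.x, fa.y + fb.y);  // repack
    #endif
    }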
@@ -67,20 +68,12 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, co
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
-#if __CUDA_ARCH__ >= 320
   return __ldg((const half2*)from);
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
 }
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
-#if __CUDA_ARCH__ >= 320
   return __halves2half2(__ldg(from+0), __ldg(from+1));
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
 }
 
 template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
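Note: with this file now compiled only for arch >= 5.3 (previous hunk), the per-function __CUDA_ARCH__ >= 320 checks were always true, so their #else fallbacks were dead code. __ldg loads through the read-only data cache and has existed since sm_32. An illustrative standalone version, names invented:

    #include <cuda_fp16.h>

    // Aligned pair load through the read-only cache (sketch, sm_32+).
    __device__ half2 load_ro(const half* __restrict__ from) {
      return __ldg(reinterpret_cast<const half2*>(from));
    }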
@@ -113,8 +106,6 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-// The following operations require arch >= 5.3
-#if __CUDA_ARCH__ >= 530
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 }
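Note: plset<half2>(a) materializes the lane offsets {a, a+1}, and its __hadd is itself an sm_53 intrinsic, so the dedicated guard deleted here became redundant once the file-wide 530 check from the first hunk landed. A hypothetical use of the same pattern, helper name invented, sm_53+ only:

    #include <cuda_fp16.h>

    // Fill a buffer with start, start+1, start+2, ... two lanes at a time.
    // Assumes out is 4-byte aligned and holds 2*pairs halves.
    __device__ void iota_half(half* out, int pairs, half start) {
      half2 v = __halves2half2(start, __hadd(start, __float2half(1.0f)));
      half2 step = __float2half2_rn(2.0f);   // advance both lanes by 2
      for (int i = 0; i < pairs; ++i) {
        reinterpret_cast<half2*>(out)[i] = v;
        v = __hadd2(v, step);
      }
    }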
@@ -190,7 +181,6 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
 template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
   return __hmul(__low2half(a), __high2half(a));
 }
-#endif
 
 } // end namespace internal
 
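Note: the deleted #endif closed the inner "arch >= 5.3" block that the first hunk made redundant. The predux_* functions are horizontal reductions across packet lanes, of which half2 has two. A hedged sketch of the multiply reduction, with the float fallback shape a sub-5.3 build would need (not Eigen's code):

    #include <cuda_fp16.h>

    // Multiply half2's two lanes into one scalar half (sketch).
    __device__ half reduce_mul(half2 a) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
      return __hmul(__low2half(a), __high2half(a));  // native fp16 multiply
    #else
      float2 f = __half22float2(a);                  // widen, multiply, narrow
      return __float2half(f.x * f.y);
    #endif
    }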
@@ -71,6 +71,7 @@ struct functor_traits<scalar_cast_op<half, float> >
 
 
 
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 
 template <>
 struct type_casting_traits<half, float> {
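Note: the diff view elides the body of this specialization. As a hedged reconstruction based on Eigen's casting conventions (not shown in this commit), such traits declare how many source packets feed one target packet: two half2, four halves, widen into one float4.

    // Hedged reconstruction of the elided body, not taken from the diff.
    template <>
    struct type_casting_traits<half, float> {
      enum {
        VectorizedCast = 1,  // a pcast<half2, float4> specialization exists
        SrcCoeffRatio  = 2,  // two half2 packets consumed...
        TgtCoeffRatio  = 1   // ...per float4 produced
      };
    };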
@@ -82,22 +83,9 @@ struct type_casting_traits<half, float> {
 };
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float2 r1 = __half22float2(a);
   float2 r2 = __half22float2(b);
   return make_float4(r1.x, r1.y, r2.x, r2.y);
-#else
-  half r1;
-  r1.x = a.x & 0xFFFF;
-  half r2;
-  r2.x = (a.x & 0xFFFF0000) >> 16;
-  half r3;
-  r3.x = b.x & 0xFFFF;
-  half r4;
-  r4.x = (b.x & 0xFFFF0000) >> 16;
-  return make_float4(static_cast<float>(r1), static_cast<float>(r2),
-                     static_cast<float>(r3), static_cast<float>(r4));
-#endif
 }
 
 template <>
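Note: the deleted #else split the packed 32-bit half2 word by masking its raw x bit field; the surviving branch obtains the same lanes from two conversion intrinsics. A standalone hedged version of the kept path:

    #include <cuda_fp16.h>

    // Widen two half2 packets (four fp16 lanes) into one float4 (sketch).
    __device__ float4 widen_pairs(half2 a, half2 b) {
      float2 lo = __half22float2(a);
      float2 hi = __half22float2(b);
      return make_float4(lo.x, lo.y, hi.x, hi.y);
    }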
@@ -111,19 +99,10 @@ struct type_casting_traits<float, half> {
 
 template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
   // Simply discard the second half of the input
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float22half2_rn(make_float2(a.x, a.y));
-#else
-  half r1 = static_cast<half>(a.x);
-  half r2 = static_cast<half>(a.y);
-  half2 r;
-  r.x = 0;
-  r.x |= r1.x;
-  r.x |= (static_cast<unsigned int>(r2.x) << 16);
-  return r;
-#endif
 }
 
+#endif
 #endif
 
 } // end namespace internal
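Note: the kept narrowing path packs lanes x and y with round-to-nearest and deliberately drops z and w, per the "Simply discard the second half" comment. A hedged standalone sketch:

    #include <cuda_fp16.h>

    // Narrow the first two float4 lanes into a half2; z and w are dropped.
    __device__ half2 narrow(float4 a) {
      return __float22half2_rn(make_float2(a.x, a.y));
    }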
@@ -210,10 +210,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     ei_add_test(cxx11_tensor_random_cuda)
   endif()
 
-  # Operations other that casting of half floats are only supported starting with arch 5.3
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52)
-    ei_add_test(cxx11_tensor_of_float16_cuda)
-  endif()
+  ei_add_test(cxx11_tensor_of_float16_cuda)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()