mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-21 00:59:36 +08:00
Added support for CUDA 9.0.
This commit is contained in:
parent
304ef29571
commit
a4089991eb
@ -22,6 +22,7 @@
|
|||||||
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
|
||||||
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||||
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||||
#elif defined(__CUDACC_VER__)
|
#elif defined(__CUDACC_VER__)
|
||||||
|
@ -50,38 +50,45 @@ struct half;
|
|||||||
namespace half_impl {
|
namespace half_impl {
|
||||||
|
|
||||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
#if !defined(EIGEN_HAS_CUDA_FP16)
|
||||||
|
// Make our own __half_raw definition that is similar to CUDA's.
|
||||||
// Make our own __half definition that is similar to CUDA's.
|
struct __half_raw {
|
||||||
struct __half {
|
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||||
EIGEN_DEVICE_FUNC __half() : x(0) {}
|
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||||
explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
|
|
||||||
unsigned short x;
|
unsigned short x;
|
||||||
};
|
};
|
||||||
|
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
|
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||||
|
typedef __half __half_raw;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
|
||||||
|
|
||||||
struct half_base : public __half {
|
struct half_base : public __half_raw {
|
||||||
EIGEN_DEVICE_FUNC half_base() {}
|
EIGEN_DEVICE_FUNC half_base() {}
|
||||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
|
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
||||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
|
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
||||||
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||||
|
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace half_impl
|
} // namespace half_impl
|
||||||
|
|
||||||
// Class definition.
|
// Class definition.
|
||||||
struct half : public half_impl::half_base {
|
struct half : public half_impl::half_base {
|
||||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||||
typedef half_impl::__half __half;
|
typedef half_impl::__half_raw __half_raw;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC half() {}
|
EIGEN_DEVICE_FUNC half() {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
||||||
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
||||||
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||||
|
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||||
|
#endif
|
||||||
|
|
||||||
explicit EIGEN_DEVICE_FUNC half(bool b)
|
explicit EIGEN_DEVICE_FUNC half(bool b)
|
||||||
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
||||||
@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
|
|||||||
// these in hardware. If we need more performance on older/other CPUs, they are
|
// these in hardware. If we need more performance on older/other CPUs, they are
|
||||||
// also possible to vectorize directly.
|
// also possible to vectorize directly.
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
|
||||||
__half h;
|
__half_raw h;
|
||||||
h.x = x;
|
h.x = x;
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
@ -280,12 +287,13 @@ union FP32 {
|
|||||||
float f;
|
float f;
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __float2half(ff);
|
__half tmp_ff = __float2half(ff);
|
||||||
|
return *(__half_raw*)&tmp_ff;
|
||||||
|
|
||||||
#elif defined(EIGEN_HAS_FP16_C)
|
#elif defined(EIGEN_HAS_FP16_C)
|
||||||
__half h;
|
__half_raw h;
|
||||||
h.x = _cvtss_sh(ff, 0);
|
h.x = _cvtss_sh(ff, 0);
|
||||||
return h;
|
return h;
|
||||||
|
|
||||||
@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
|||||||
const FP32 f16max = { (127 + 16) << 23 };
|
const FP32 f16max = { (127 + 16) << 23 };
|
||||||
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
|
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
|
||||||
unsigned int sign_mask = 0x80000000u;
|
unsigned int sign_mask = 0x80000000u;
|
||||||
__half o;
|
__half_raw o;
|
||||||
o.x = static_cast<unsigned short>(0x0u);
|
o.x = static_cast<unsigned short>(0x0u);
|
||||||
|
|
||||||
unsigned int sign = f.u & sign_mask;
|
unsigned int sign = f.u & sign_mask;
|
||||||
@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __half2float(h);
|
return __half2float(h);
|
||||||
|
|
||||||
@ -612,11 +620,15 @@ struct hash<Eigen::half> {
|
|||||||
// Add the missing shfl_xor intrinsic
|
// Add the missing shfl_xor intrinsic
|
||||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||||
|
#if EIGEN_CUDACC_VER < 90000
|
||||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||||
|
#else
|
||||||
|
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// ldg() has an overload for __half, but we also need one for Eigen::half.
|
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||||
return Eigen::half_impl::raw_uint16_to_half(
|
return Eigen::half_impl::raw_uint16_to_half(
|
||||||
|
@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
|
|||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
||||||
half2 result;
|
half2 result;
|
||||||
result.x = a.x & 0x7FFF7FFF;
|
unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
|
||||||
|
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ using Eigen::half;
|
|||||||
|
|
||||||
void test_conversion()
|
void test_conversion()
|
||||||
{
|
{
|
||||||
using Eigen::half_impl::__half;
|
using Eigen::half_impl::__half_raw;
|
||||||
|
|
||||||
// Conversion from float.
|
// Conversion from float.
|
||||||
VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
|
VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
|
||||||
@ -37,9 +37,9 @@ void test_conversion()
|
|||||||
VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
|
VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
|
||||||
|
|
||||||
// Verify round-to-nearest-even behavior.
|
// Verify round-to-nearest-even behavior.
|
||||||
float val1 = float(half(__half(0x3c00)));
|
float val1 = float(half(__half_raw(0x3c00)));
|
||||||
float val2 = float(half(__half(0x3c01)));
|
float val2 = float(half(__half_raw(0x3c01)));
|
||||||
float val3 = float(half(__half(0x3c02)));
|
float val3 = float(half(__half_raw(0x3c02)));
|
||||||
VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
|
VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
|
||||||
VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
|
VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
|
||||||
|
|
||||||
@ -55,21 +55,21 @@ void test_conversion()
|
|||||||
VERIFY_IS_EQUAL(half(true).x, 0x3c00);
|
VERIFY_IS_EQUAL(half(true).x, 0x3c00);
|
||||||
|
|
||||||
// Conversion to float.
|
// Conversion to float.
|
||||||
VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f);
|
VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f);
|
||||||
VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f);
|
VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f);
|
||||||
|
|
||||||
// Denormals.
|
// Denormals.
|
||||||
VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f);
|
VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f);
|
||||||
VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f);
|
VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f);
|
||||||
VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f);
|
VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f);
|
||||||
|
|
||||||
// NaNs and infinities.
|
// NaNs and infinities.
|
||||||
VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
|
VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
|
||||||
VERIFY(!(numext::isnan)(float(half(0.0f))));
|
VERIFY(!(numext::isnan)(float(half(0.0f))));
|
||||||
VERIFY((numext::isinf)(float(half(__half(0xfc00)))));
|
VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
|
||||||
VERIFY((numext::isnan)(float(half(__half(0xfc01)))));
|
VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
|
||||||
VERIFY((numext::isinf)(float(half(__half(0x7c00)))));
|
VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
|
||||||
VERIFY((numext::isnan)(float(half(__half(0x7c01)))));
|
VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
|
||||||
|
|
||||||
#if !EIGEN_COMP_MSVC
|
#if !EIGEN_COMP_MSVC
|
||||||
// Visual Studio errors out on divisions by 0
|
// Visual Studio errors out on divisions by 0
|
||||||
@ -79,12 +79,12 @@ void test_conversion()
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Exactly same checks as above, just directly on the half representation.
|
// Exactly same checks as above, just directly on the half representation.
|
||||||
VERIFY(!(numext::isinf)(half(__half(0x7bff))));
|
VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
|
||||||
VERIFY(!(numext::isnan)(half(__half(0x0000))));
|
VERIFY(!(numext::isnan)(half(__half_raw(0x0000))));
|
||||||
VERIFY((numext::isinf)(half(__half(0xfc00))));
|
VERIFY((numext::isinf)(half(__half_raw(0xfc00))));
|
||||||
VERIFY((numext::isnan)(half(__half(0xfc01))));
|
VERIFY((numext::isnan)(half(__half_raw(0xfc01))));
|
||||||
VERIFY((numext::isinf)(half(__half(0x7c00))));
|
VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
|
||||||
VERIFY((numext::isnan)(half(__half(0x7c01))));
|
VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
|
||||||
|
|
||||||
#if !EIGEN_COMP_MSVC
|
#if !EIGEN_COMP_MSVC
|
||||||
// Visual Studio errors out on divisions by 0
|
// Visual Studio errors out on divisions by 0
|
||||||
|
@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
// the sum across all big k blocks of the product of little k block of index (x, y)
|
// the sum across all big k blocks of the product of little k block of index (x, y)
|
||||||
// with block of index (y, z). To compute the final output, we need to reduce
|
// with block of index (y, z). To compute the final output, we need to reduce
|
||||||
// the 8 threads over y by summation.
|
// the 8 threads over y by summation.
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
|
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
|
||||||
|
#else
|
||||||
|
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define reduceRow(i, mask) \
|
#define reduceRow(i, mask) \
|
||||||
shuffleInc(i, 0, mask); \
|
shuffleInc(i, 0, mask); \
|
||||||
@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
|||||||
x1 = rhs_pf0.x;
|
x1 = rhs_pf0.x;
|
||||||
x2 = rhs_pf0.z;
|
x2 = rhs_pf0.z;
|
||||||
}
|
}
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
x1 = __shfl_xor(x1, 4);
|
x1 = __shfl_xor(x1, 4);
|
||||||
x2 = __shfl_xor(x2, 4);
|
x2 = __shfl_xor(x2, 4);
|
||||||
|
#else
|
||||||
|
x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
|
||||||
|
x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
|
||||||
|
#endif
|
||||||
if((threadIdx.x%8) < 4) {
|
if((threadIdx.x%8) < 4) {
|
||||||
rhs_pf0.y = x1;
|
rhs_pf0.y = x1;
|
||||||
rhs_pf0.w = x2;
|
rhs_pf0.w = x2;
|
||||||
|
@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
|
|||||||
else {
|
else {
|
||||||
assert(0 && "Wordsize not supported");
|
assert(0 && "Wordsize not supported");
|
||||||
}
|
}
|
||||||
#else // __CUDA_ARCH__ >= 300
|
#else // EIGEN_CUDA_ARCH >= 300
|
||||||
assert(0 && "Shouldn't be called on unsupported device");
|
assert(0 && "Shouldn't be called on unsupported device");
|
||||||
#endif // __CUDA_ARCH__ >= 300
|
#endif // EIGEN_CUDA_ARCH >= 300
|
||||||
}
|
}
|
||||||
|
|
||||||
// We extend atomicExch to support extra data types
|
// We extend atomicExch to support extra data types
|
||||||
@ -104,9 +104,9 @@ template <>
|
|||||||
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||||
#if EIGEN_CUDA_ARCH >= 300
|
#if EIGEN_CUDA_ARCH >= 300
|
||||||
atomicAdd(output, accum);
|
atomicAdd(output, accum);
|
||||||
#else // __CUDA_ARCH__ >= 300
|
#else // EIGEN_CUDA_ARCH >= 300
|
||||||
assert(0 && "Shouldn't be called on unsupported device");
|
assert(0 && "Shouldn't be called on unsupported device");
|
||||||
#endif // __CUDA_ARCH__ >= 300
|
#endif // EIGEN_CUDA_ARCH >= 300
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
|
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
|
||||||
|
#else
|
||||||
|
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||||
@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
|||||||
// Let the last block reset the semaphore
|
// Let the last block reset the semaphore
|
||||||
atomicInc(semaphore, gridDim.x + 1);
|
atomicInc(semaphore, gridDim.x + 1);
|
||||||
}
|
}
|
||||||
#else // __CUDA_ARCH__ >= 300
|
#else // EIGEN_CUDA_ARCH >= 300
|
||||||
assert(0 && "Shouldn't be called on unsupported device");
|
assert(0 && "Shouldn't be called on unsupported device");
|
||||||
#endif // __CUDA_ARCH__ >= 300
|
#endif // EIGEN_CUDA_ARCH >= 300
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
|||||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
|
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
|
||||||
|
|
||||||
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
|
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
|
||||||
if (gridDim.x == 1 && first_index == 0) {
|
if (gridDim.x == 1) {
|
||||||
if (num_coeffs % 2 != 0) {
|
if (first_index == 0) {
|
||||||
half last = input.m_impl.coeff(num_coeffs-1);
|
if (num_coeffs % 2 != 0) {
|
||||||
*scratch = __halves2half2(last, reducer.initialize());
|
half last = input.m_impl.coeff(num_coeffs-1);
|
||||||
} else {
|
*scratch = __halves2half2(last, reducer.initialize());
|
||||||
*scratch = reducer.template initializePacket<half2>();
|
} else {
|
||||||
|
*scratch = reducer.template initializePacket<half2>();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
|
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
|
||||||
|
#else
|
||||||
|
int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
|
||||||
|
reducer.reducePacket(*(half2*)(&temp), &accum);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||||
atomicReduce(scratch, accum, reducer);
|
atomicReduce(scratch, accum, reducer);
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
if (gridDim.x == 1) {
|
||||||
|
__syncthreads();
|
||||||
if (gridDim.x == 1 && first_index == 0) {
|
if (first_index == 0) {
|
||||||
half tmp = __low2half(*scratch);
|
half tmp = __low2half(*scratch);
|
||||||
reducer.reduce(__high2half(*scratch), &tmp);
|
reducer.reduce(__high2half(*scratch), &tmp);
|
||||||
*output = tmp;
|
*output = tmp;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
|
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
|
||||||
|
#else
|
||||||
|
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||||
@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else // __CUDA_ARCH__ >= 300
|
#else // EIGEN_CUDA_ARCH >= 300
|
||||||
assert(0 && "Shouldn't be called on unsupported device");
|
assert(0 && "Shouldn't be called on unsupported device");
|
||||||
#endif // __CUDA_ARCH__ >= 300
|
#endif // EIGEN_CUDA_ARCH >= 300
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_CUDA_FP16
|
#ifdef EIGEN_HAS_CUDA_FP16
|
||||||
@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
|
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
|
||||||
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
|
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
|
||||||
|
#else
|
||||||
|
int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
|
||||||
|
int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
|
||||||
|
reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
|
||||||
|
reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
half val1 = __low2half(reduced_val1);
|
half val1 = __low2half(reduced_val1);
|
||||||
|
@ -15,6 +15,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
|
|
||||||
template <int Layout>
|
template <int Layout>
|
||||||
|
@ -16,6 +16,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
|
|
||||||
void test_cuda_conversion() {
|
void test_cuda_conversion() {
|
||||||
|
@ -14,6 +14,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
|
|
||||||
void test_cuda_nullary() {
|
void test_cuda_nullary() {
|
||||||
|
@ -14,6 +14,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
|
@ -17,6 +17,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||||
|
|
||||||
|
@ -15,6 +15,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
|
|
||||||
void test_cuda_nullary() {
|
void test_cuda_nullary() {
|
||||||
|
@ -16,6 +16,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
using Eigen::RowMajor;
|
using Eigen::RowMajor;
|
||||||
|
|
||||||
|
@ -16,6 +16,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
|
|
||||||
template<typename>
|
template<typename>
|
||||||
|
@ -16,6 +16,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <Eigen/CXX11/Tensor>
|
#include <Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
void test_cuda_random_uniform()
|
void test_cuda_random_uniform()
|
||||||
{
|
{
|
||||||
|
@ -15,6 +15,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
template<typename Type, int DataLayout>
|
template<typename Type, int DataLayout>
|
||||||
static void test_full_reductions() {
|
static void test_full_reductions() {
|
||||||
|
@ -16,6 +16,12 @@
|
|||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include <unsupported/Eigen/CXX11/Tensor>
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
// The EIGEN_CUDACC_VER macro is provided by
|
||||||
|
// unsupported/Eigen/CXX11/Tensor included above
|
||||||
|
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user