From c2b6df6e60a6b0e59ba4a07fd2895cf2a1a74be4 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 31 Aug 2021 08:33:19 -0700 Subject: [PATCH] Disable cuda Eigen::half vectorization on host. All cuda `__half` functions are device-only in CUDA 9, including conversions. Host-side conversions were added in CUDA 10. The existing code doesn't build prior to 10.0. All arithmetic functions are always device-only, so there's therefore no reason to use vectorization on the host at all. Modified the code to disable vectorization for `__half` on host, which required also updating the `TensorReductionGpu` implementation which previously made assumptions about available packets. (cherry picked from commit cc3573ab4451853774cd5c3497373d5fe8914774) --- Eigen/src/Core/arch/GPU/PacketMath.h | 220 ++++++++---------- Eigen/src/Core/arch/GPU/TypeCasting.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- .../CXX11/src/Tensor/TensorReductionGpu.h | 57 +++-- 4 files changed, 126 insertions(+), 156 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 689110ded..25c45fd35 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -493,9 +493,10 @@ ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +// Half-packet functions are not available on the host for CUDA 9.0-9.2, only +// on device. There is no benefit to using them on the host anyways, since they are +// emulated. +#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; @@ -526,42 +527,9 @@ template<> struct packet_traits : default_packet_traits }; }; -namespace { -// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __halves2half2(a, b); -#else - // Round-about way since __halves2half2 is a __device__ function. - return __floats2half2_rn(__half2float(a), __half2float(b)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __low2half(a); -#else - return __float2half(__low2float(a)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __high2half(a); -#else - return __float2half(__high2float(a)); -#endif -} -} // namespace - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2half2(from); -#else - const float f = __half2float(from); - return __floats2half2_rn(f, f); -#endif } template <> @@ -576,8 +544,6 @@ pset1(const Eigen::half& from) { return r; } -// We now need this visible on both host and device. -// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -585,11 +551,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return combine_half(from[0], from[1]); + return __halves2half2(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return combine_half(from[0], from[0]); + return __halves2half2(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -599,8 +565,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = get_half2_low(from); - to[1] = get_half2_high(from); + to[0] = __low2half(from); + to[1] = __high2half(from); } @@ -610,7 +576,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( // Input is guaranteed to be properly aligned. return __ldg(reinterpret_cast(from)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } @@ -619,31 +585,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( #if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return combine_half(from[0*stride], from[1*stride]); + return __halves2half2(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = get_half2_low(from); - to[stride*1] = get_half2_high(from); + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return get_half2_low(a); + return __low2half(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); + half a1 = __low2half(a); + half a2 = __high2half(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { @@ -658,12 +624,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = get_half2_low(kernel.packet[0]); - __half a2 = get_half2_high(kernel.packet[0]); - __half b1 = get_half2_low(kernel.packet[1]); - __half b2 = get_half2_high(kernel.packet[1]); - kernel.packet[0] = combine_half(a1, b1); - kernel.packet[1] = combine_half(a2, b2); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { @@ -671,88 +637,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return combine_half(a, __float2half(f)); + return __halves2half2(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = get_half2_low(mask); - half mask_high = get_half2_high(mask); - half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); - half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); - return combine_half(result_low, result_high); + half mask_low = __low2half(mask); + half mask_high = __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, @@ -851,9 +817,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -862,9 +828,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { @@ -885,7 +851,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? get_half2_low(a) : get_half2_high(a); + return a1 > a2 ? __low2half(a) : __high2half(a); #endif } @@ -897,7 +863,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? get_half2_low(a) : get_half2_high(a); + return a1 < a2 ? __low2half(a) : __high2half(a); #endif } @@ -1068,10 +1034,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); - p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); - p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); - p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); + p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); + p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); + p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); return r; } @@ -1152,12 +1118,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = get_half2_low(f0); - __half a2 = get_half2_high(f0); - __half b1 = get_half2_low(f1); - __half b2 = get_half2_high(f1); - f0 = combine_half(a1, b1); - f1 = combine_half(a2, b2); + __half a1 = __low2half(f0); + __half a2 = __high2half(f0); + __half b1 = __low2half(f1); + __half b2 = __high2half(f1); + f0 = __halves2half2(a1, b1); + f1 = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -1254,10 +1220,10 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(a, __float2half(f + 1.0f)); - p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); + p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif } @@ -1477,9 +1443,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_max(a_alias[0]), + half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = combine_half(predux_max(a_alias[2]), + half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1496,9 +1462,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_min(a_alias[0]), + half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = combine_half(predux_min(a_alias[2]), + half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1652,9 +1618,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } template<> @@ -1664,14 +1630,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } -// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) - -#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) #undef EIGEN_GPU_HAS_LDG #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 754546225..c8195bb2b 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -15,8 +15,7 @@ namespace Eigen { namespace internal { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index a6181d35e..b90a1dcd6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index db4e8d866..315ccc172 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -98,6 +98,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { } } } +#ifdef EIGEN_GPU_COMPILE_PHASE // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations template __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { @@ -107,6 +108,7 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc atomicReduce(houtput+i,*(haccum+i),reducer); } } +#endif // EIGEN_GPU_COMPILE_PHASE #endif // EIGEN_HAS_GPU_FP16 template <> @@ -213,8 +215,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer #ifdef EIGEN_HAS_GPU_FP16 template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, half* scratch) { eigen_assert(blockDim.x == 1); eigen_assert(gridDim.x == 1); typedef packet_traits::type packet_type; @@ -224,15 +226,16 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo half2* h2scratch = reinterpret_cast(scratch); for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { *h2scratch = - __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); + __halves2half2(input.coeff(i), input.coeff(i + 1)); h2scratch++; } if ((num_coeffs & 1) != 0) { - half lastCoeff = input.m_impl.coeff(num_coeffs - 1); + half lastCoeff = input.coeff(num_coeffs - 1); *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); } } @@ -258,8 +261,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reduce template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, + half* output, half* scratch) { typedef typename packet_traits::type PacketType; const int packet_width = unpacket_traits::size; eigen_assert(NumPerThread % packet_width == 0); @@ -273,19 +277,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce int rem = num_coeffs % packet_width; if (rem != 0) { half2* p_scratch = reinterpret_cast(scratch); - *scratch = reducer.template initializePacket(); + pstoreu(scratch, reducer.template initializePacket()); for (int i = 0; i < rem / 2; i++) { *p_scratch = __halves2half2( - input.m_impl.coeff(num_coeffs - packet_width + 2 * i), - input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); + input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 2 * i + 1)); p_scratch++; } if ((num_coeffs & 1) != 0) { - half last = input.m_impl.coeff(num_coeffs - 1); + half last = input.coeff(num_coeffs - 1); *p_scratch = __halves2half2(last, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); } } __syncthreads(); @@ -298,7 +303,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce for (Index i = 0; i < max_iter; i += BlockSize) { const Index index = first_index + packet_width * i; eigen_assert(index + packet_width < num_coeffs); - PacketType val = input.m_impl.template packet(index); + PacketType val = input.template packet(index); reducer.reducePacket(val, &accum); } @@ -337,7 +342,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(scratch, accum, reducer); + atomicReduce(reinterpret_cast(scratch), accum, reducer); } __syncthreads(); @@ -357,17 +362,21 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { eigen_assert(threadIdx.x == 1); - half2* pscratch = reinterpret_cast(scratch); - half tmp = __float2half(0.f); typedef packet_traits::type packet_type; - for (int i = 0; i < unpacket_traits::size; i += 2) { - reducer.reduce(__low2half(*pscratch), &tmp); - reducer.reduce(__high2half(*pscratch), &tmp); - pscratch++; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; } - *output = tmp; } #endif // EIGEN_HAS_GPU_FP16 @@ -416,13 +425,11 @@ template struct FullReductionLauncher { static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; - typedef typename packet_traits::type PacketType; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); - PacketType* scratch = static_cast(device.scratchpad()); - // half2* scratch = static_cast(device.scratchpad()); + half* scratch = static_cast(device.scratchpad()); if (num_blocks > 1) { // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there