mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-09 14:41:49 +08:00
Disable cuda Eigen::half vectorization on host.
All cuda `__half` functions are device-only in CUDA 9, including conversions. Host-side conversions were added in CUDA 10. The existing code doesn't build prior to 10.0. All arithmetic functions are always device-only, so there's therefore no reason to use vectorization on the host at all. Modified the code to disable vectorization for `__half` on host, which required also updating the `TensorReductionGpu` implementation which previously made assumptions about available packets. (cherry picked from commit cc3573ab4451853774cd5c3497373d5fe8914774)
This commit is contained in:
parent
277d369060
commit
c2b6df6e60
@ -493,9 +493,10 @@ ptranspose(PacketBlock<double2,2>& kernel) {
|
||||
|
||||
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
|
||||
// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
|
||||
// its corresponding packet_traits<Eigen::half> must be visible on host.
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
||||
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
|
||||
// on device. There is no benefit to using them on the host anyways, since they are
|
||||
// emulated.
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
|
||||
typedef ulonglong2 Packet4h2;
|
||||
template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
|
||||
@ -526,42 +527,9 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
namespace {
|
||||
// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return __halves2half2(a, b);
|
||||
#else
|
||||
// Round-about way since __halves2half2 is a __device__ function.
|
||||
return __floats2half2_rn(__half2float(a), __half2float(b));
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return __low2half(a);
|
||||
#else
|
||||
return __float2half(__low2float(a));
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return __high2half(a);
|
||||
#else
|
||||
return __float2half(__high2float(a));
|
||||
#endif
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return __half2half2(from);
|
||||
#else
|
||||
const float f = __half2float(from);
|
||||
return __floats2half2_rn(f, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -576,8 +544,6 @@ pset1<Packet4h2>(const Eigen::half& from) {
|
||||
return r;
|
||||
}
|
||||
|
||||
// We now need this visible on both host and device.
|
||||
// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
||||
namespace {
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
||||
@ -585,11 +551,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
|
||||
return combine_half(from[0], from[1]);
|
||||
return __halves2half2(from[0], from[1]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
|
||||
return combine_half(from[0], from[0]);
|
||||
return __halves2half2(from[0], from[0]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
||||
@ -599,8 +565,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
|
||||
const half2& from) {
|
||||
to[0] = get_half2_low(from);
|
||||
to[1] = get_half2_high(from);
|
||||
to[0] = __low2half(from);
|
||||
to[1] = __high2half(from);
|
||||
}
|
||||
|
||||
|
||||
@ -610,7 +576,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
|
||||
// Input is guaranteed to be properly aligned.
|
||||
return __ldg(reinterpret_cast<const half2*>(from));
|
||||
#else
|
||||
return combine_half(*(from+0), *(from+1));
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -619,31 +585,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
|
||||
#if defined(EIGEN_GPU_HAS_LDG)
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return combine_half(*(from+0), *(from+1));
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
|
||||
Index stride) {
|
||||
return combine_half(from[0*stride], from[1*stride]);
|
||||
return __halves2half2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
|
||||
Eigen::half* to, const half2& from, Index stride) {
|
||||
to[stride*0] = get_half2_low(from);
|
||||
to[stride*1] = get_half2_high(from);
|
||||
to[stride*0] = __low2half(from);
|
||||
to[stride*1] = __high2half(from);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
|
||||
return get_half2_low(a);
|
||||
return __low2half(a);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
|
||||
half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
|
||||
return combine_half(result1, result2);
|
||||
return __halves2half2(result1, result2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
|
||||
@ -658,12 +624,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
__half a1 = get_half2_low(kernel.packet[0]);
|
||||
__half a2 = get_half2_high(kernel.packet[0]);
|
||||
__half b1 = get_half2_low(kernel.packet[1]);
|
||||
__half b2 = get_half2_high(kernel.packet[1]);
|
||||
kernel.packet[0] = combine_half(a1, b1);
|
||||
kernel.packet[1] = combine_half(a2, b2);
|
||||
__half a1 = __low2half(kernel.packet[0]);
|
||||
__half a2 = __high2half(kernel.packet[0]);
|
||||
__half b1 = __low2half(kernel.packet[1]);
|
||||
__half b2 = __high2half(kernel.packet[1]);
|
||||
kernel.packet[0] = __halves2half2(a1, b1);
|
||||
kernel.packet[1] = __halves2half2(a2, b2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
||||
@ -671,88 +637,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
#else
|
||||
float f = __half2float(a) + 1.0f;
|
||||
return combine_half(a, __float2half(f));
|
||||
return __halves2half2(a, __float2half(f));
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
|
||||
const half2& a,
|
||||
const half2& b) {
|
||||
half mask_low = get_half2_low(mask);
|
||||
half mask_high = get_half2_high(mask);
|
||||
half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
|
||||
half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
|
||||
return combine_half(result_low, result_high);
|
||||
half mask_low = __low2half(mask);
|
||||
half mask_high = __high2half(mask);
|
||||
half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
|
||||
half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
|
||||
return __halves2half2(result_low, result_high);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
|
||||
const half2& b) {
|
||||
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
||||
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half b1 = get_half2_low(b);
|
||||
half b2 = get_half2_high(b);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half b1 = __low2half(b);
|
||||
half b2 = __high2half(b);
|
||||
half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
|
||||
half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
|
||||
return combine_half(eq1, eq2);
|
||||
return __halves2half2(eq1, eq2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
|
||||
const half2& b) {
|
||||
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
||||
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half b1 = get_half2_low(b);
|
||||
half b2 = get_half2_high(b);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half b1 = __low2half(b);
|
||||
half b2 = __high2half(b);
|
||||
half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
|
||||
half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
|
||||
return combine_half(eq1, eq2);
|
||||
return __halves2half2(eq1, eq2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
|
||||
const half2& b) {
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half b1 = get_half2_low(b);
|
||||
half b2 = get_half2_high(b);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half b1 = __low2half(b);
|
||||
half b2 = __high2half(b);
|
||||
half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
|
||||
half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
|
||||
return combine_half(result1, result2);
|
||||
return __halves2half2(result1, result2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
|
||||
const half2& b) {
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half b1 = get_half2_low(b);
|
||||
half b2 = get_half2_high(b);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half b1 = __low2half(b);
|
||||
half b2 = __high2half(b);
|
||||
half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
|
||||
half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
|
||||
return combine_half(result1, result2);
|
||||
return __halves2half2(result1, result2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
|
||||
const half2& b) {
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half b1 = get_half2_low(b);
|
||||
half b2 = get_half2_high(b);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half b1 = __low2half(b);
|
||||
half b2 = __high2half(b);
|
||||
half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
|
||||
half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
|
||||
return combine_half(result1, result2);
|
||||
return __halves2half2(result1, result2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
|
||||
const half2& b) {
|
||||
half a1 = get_half2_low(a);
|
||||
half a2 = get_half2_high(a);
|
||||
half b1 = get_half2_low(b);
|
||||
half b2 = get_half2_high(b);
|
||||
half a1 = __low2half(a);
|
||||
half a2 = __high2half(a);
|
||||
half b1 = __low2half(b);
|
||||
half b2 = __high2half(b);
|
||||
half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
|
||||
half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
|
||||
return combine_half(result1, result2);
|
||||
return __halves2half2(result1, result2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
|
||||
@ -851,9 +817,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
__half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
|
||||
__half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
|
||||
return combine_half(r1, r2);
|
||||
__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
||||
__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
|
||||
@ -862,9 +828,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
__half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
|
||||
__half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
|
||||
return combine_half(r1, r2);
|
||||
__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
||||
__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
|
||||
@ -885,7 +851,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
|
||||
return a1 > a2 ? __low2half(a) : __high2half(a);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -897,7 +863,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
|
||||
return a1 < a2 ? __low2half(a) : __high2half(a);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1068,10 +1034,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
||||
pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
|
||||
Packet4h2 r;
|
||||
half2* p_alias = reinterpret_cast<half2*>(&r);
|
||||
p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
|
||||
p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
|
||||
p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
|
||||
p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
|
||||
p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
|
||||
p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
|
||||
p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
|
||||
p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -1152,12 +1118,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||
ptranspose_half(half2& f0, half2& f1) {
|
||||
__half a1 = get_half2_low(f0);
|
||||
__half a2 = get_half2_high(f0);
|
||||
__half b1 = get_half2_low(f1);
|
||||
__half b2 = get_half2_high(f1);
|
||||
f0 = combine_half(a1, b1);
|
||||
f1 = combine_half(a2, b2);
|
||||
__half a1 = __low2half(f0);
|
||||
__half a2 = __high2half(f0);
|
||||
__half b1 = __low2half(f1);
|
||||
__half b2 = __high2half(f1);
|
||||
f0 = __halves2half2(a1, b1);
|
||||
f1 = __halves2half2(a2, b2);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||
@ -1254,10 +1220,10 @@ plset<Packet4h2>(const Eigen::half& a) {
|
||||
float f = __half2float(a);
|
||||
Packet4h2 r;
|
||||
half2* p_alias = reinterpret_cast<half2*>(&r);
|
||||
p_alias[0] = combine_half(a, __float2half(f + 1.0f));
|
||||
p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
|
||||
p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
|
||||
p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
|
||||
p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
|
||||
p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
|
||||
p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
|
||||
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
|
||||
return r;
|
||||
#endif
|
||||
}
|
||||
@ -1477,9 +1443,9 @@ template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
|
||||
const Packet4h2& a) {
|
||||
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
||||
half2 m0 = combine_half(predux_max(a_alias[0]),
|
||||
half2 m0 = __halves2half2(predux_max(a_alias[0]),
|
||||
predux_max(a_alias[1]));
|
||||
half2 m1 = combine_half(predux_max(a_alias[2]),
|
||||
half2 m1 = __halves2half2(predux_max(a_alias[2]),
|
||||
predux_max(a_alias[3]));
|
||||
__half first = predux_max(m0);
|
||||
__half second = predux_max(m1);
|
||||
@ -1496,9 +1462,9 @@ template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
|
||||
const Packet4h2& a) {
|
||||
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
||||
half2 m0 = combine_half(predux_min(a_alias[0]),
|
||||
half2 m0 = __halves2half2(predux_min(a_alias[0]),
|
||||
predux_min(a_alias[1]));
|
||||
half2 m1 = combine_half(predux_min(a_alias[2]),
|
||||
half2 m1 = __halves2half2(predux_min(a_alias[2]),
|
||||
predux_min(a_alias[3]));
|
||||
__half first = predux_min(m0);
|
||||
__half second = predux_min(m1);
|
||||
@ -1652,9 +1618,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
__half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
|
||||
__half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
|
||||
return combine_half(r1, r2);
|
||||
__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
||||
__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<>
|
||||
@ -1664,14 +1630,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
__half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
|
||||
__half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
|
||||
return combine_half(r1, r2);
|
||||
__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
||||
__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
||||
|
||||
#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
||||
#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
|
||||
#undef EIGEN_GPU_HAS_LDG
|
||||
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
|
||||
|
@ -15,8 +15,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
|
@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
|
||||
};
|
||||
|
||||
// For CUDA packet types when using a GpuDevice
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
|
||||
typedef ulonglong2 Packet4h2;
|
||||
template<>
|
||||
|
@ -98,6 +98,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||
// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
|
||||
template <typename R>
|
||||
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
|
||||
@ -107,6 +108,7 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc
|
||||
atomicReduce(houtput+i,*(haccum+i),reducer);
|
||||
}
|
||||
}
|
||||
#endif // EIGEN_GPU_COMPILE_PHASE
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <>
|
||||
@ -213,8 +215,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
template <typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
|
||||
packet_traits<Eigen::half>::type* scratch) {
|
||||
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
|
||||
Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
|
||||
eigen_assert(blockDim.x == 1);
|
||||
eigen_assert(gridDim.x == 1);
|
||||
typedef packet_traits<Eigen::half>::type packet_type;
|
||||
@ -224,15 +226,16 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo
|
||||
half2* h2scratch = reinterpret_cast<half2*>(scratch);
|
||||
for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
|
||||
*h2scratch =
|
||||
__halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
|
||||
__halves2half2(input.coeff(i), input.coeff(i + 1));
|
||||
h2scratch++;
|
||||
}
|
||||
if ((num_coeffs & 1) != 0) {
|
||||
half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
|
||||
half lastCoeff = input.coeff(num_coeffs - 1);
|
||||
*h2scratch = __halves2half2(lastCoeff, reducer.initialize());
|
||||
}
|
||||
} else {
|
||||
*scratch = reducer.template initializePacket<packet_type>();
|
||||
packet_type reduce = reducer.template initializePacket<packet_type>();
|
||||
internal::pstoreu(scratch, reduce);
|
||||
}
|
||||
}
|
||||
|
||||
@ -258,8 +261,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reduce
|
||||
|
||||
template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
|
||||
half* output, packet_traits<Eigen::half>::type* scratch) {
|
||||
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
|
||||
Reducer reducer, const Self input, Index num_coeffs,
|
||||
half* output, half* scratch) {
|
||||
typedef typename packet_traits<Eigen::half>::type PacketType;
|
||||
const int packet_width = unpacket_traits<PacketType>::size;
|
||||
eigen_assert(NumPerThread % packet_width == 0);
|
||||
@ -273,19 +277,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
|
||||
int rem = num_coeffs % packet_width;
|
||||
if (rem != 0) {
|
||||
half2* p_scratch = reinterpret_cast<half2*>(scratch);
|
||||
*scratch = reducer.template initializePacket<PacketType>();
|
||||
pstoreu(scratch, reducer.template initializePacket<PacketType>());
|
||||
for (int i = 0; i < rem / 2; i++) {
|
||||
*p_scratch = __halves2half2(
|
||||
input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
|
||||
input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
|
||||
input.coeff(num_coeffs - packet_width + 2 * i),
|
||||
input.coeff(num_coeffs - packet_width + 2 * i + 1));
|
||||
p_scratch++;
|
||||
}
|
||||
if ((num_coeffs & 1) != 0) {
|
||||
half last = input.m_impl.coeff(num_coeffs - 1);
|
||||
half last = input.coeff(num_coeffs - 1);
|
||||
*p_scratch = __halves2half2(last, reducer.initialize());
|
||||
}
|
||||
} else {
|
||||
*scratch = reducer.template initializePacket<PacketType>();
|
||||
PacketType reduce = reducer.template initializePacket<PacketType>();
|
||||
pstoreu(scratch, reduce);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
@ -298,7 +303,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
|
||||
for (Index i = 0; i < max_iter; i += BlockSize) {
|
||||
const Index index = first_index + packet_width * i;
|
||||
eigen_assert(index + packet_width < num_coeffs);
|
||||
PacketType val = input.m_impl.template packet<Unaligned>(index);
|
||||
PacketType val = input.template packet<Unaligned>(index);
|
||||
reducer.reducePacket(val, &accum);
|
||||
}
|
||||
|
||||
@ -337,7 +342,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
|
||||
}
|
||||
|
||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||
atomicReduce(scratch, accum, reducer);
|
||||
atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@ -357,17 +362,21 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
|
||||
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
|
||||
eigen_assert(threadIdx.x == 1);
|
||||
half2* pscratch = reinterpret_cast<half2*>(scratch);
|
||||
half tmp = __float2half(0.f);
|
||||
typedef packet_traits<Eigen::half>::type packet_type;
|
||||
for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
|
||||
reducer.reduce(__low2half(*pscratch), &tmp);
|
||||
reducer.reduce(__high2half(*pscratch), &tmp);
|
||||
pscratch++;
|
||||
if (unpacket_traits<packet_type>::size == 1) {
|
||||
*output = *scratch;
|
||||
} else {
|
||||
half2* pscratch = reinterpret_cast<half2*>(scratch);
|
||||
half tmp = __float2half(0.f);
|
||||
for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
|
||||
reducer.reduce(__low2half(*pscratch), &tmp);
|
||||
reducer.reduce(__high2half(*pscratch), &tmp);
|
||||
pscratch++;
|
||||
}
|
||||
*output = tmp;
|
||||
}
|
||||
*output = tmp;
|
||||
}
|
||||
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
@ -416,13 +425,11 @@ template <typename Self, typename Op>
|
||||
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
|
||||
typedef typename Self::Index Index;
|
||||
typedef typename packet_traits<Eigen::half>::type PacketType;
|
||||
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
|
||||
// half2* scratch = static_cast<half2*>(device.scratchpad());
|
||||
half* scratch = static_cast<half*>(device.scratchpad());
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
|
||||
|
Loading…
x
Reference in New Issue
Block a user