From b6a517c47dd5280f25bf23e25160f95714223ff0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 11 May 2016 21:26:48 -0700
Subject: [PATCH] Added the ability to load fp16 using the texture path. Improved the performance of some reductions on fp16

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h            | 12 ++++++------
 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h |  4 ++++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index be0e2bdf2..138881996 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -212,8 +212,8 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2&
   float a2 = __high2float(a);
   float b1 = __low2float(b);
   float b2 = __high2float(b);
-  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
   return __halves2half2(r1, r2);
 }
 
@@ -222,8 +222,8 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2&
   float a2 = __high2float(a);
   float b1 = __low2float(b);
   float b2 = __high2float(b);
-  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
   return __halves2half2(r1, r2);
 }
 
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux(const half2& a) {
 #else
   float a1 = __low2float(a);
   float a2 = __high2float(a);
-  return half(__float2half_rn(a1 + a2));
+  return half(internal::raw_uint16_to_half(__float2half_rn(a1 + a2)));
 #endif
 }
 
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) {
 #else
   float a1 = __low2float(a);
   float a2 = __high2float(a);
-  return half(__float2half_rn(a1 * a2));
+  return half(internal::raw_uint16_to_half(__float2half_rn(a1 * a2)));
 #endif
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ae4ce3c90..31b361c83 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double loadConstant(const double* address) {
   return __ldg(address);
 }
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+  return Eigen::half(internal::raw_uint16_to_half(__ldg(&address->x)));
+}
 #endif
 }
 