From b6a517c47dd5280f25bf23e25160f95714223ff0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 11 May 2016 21:26:48 -0700
Subject: [PATCH] Added the ability to load fp16 using the texture path.
 Improved the performance of some reductions on fp16

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h            | 12 ++++++------
 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h |  4 ++++
 2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index be0e2bdf2..138881996 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -212,8 +212,8 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
   float a2 = __high2float(a);
   float b1 = __low2float(b);
   float b2 = __high2float(b);
-  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
   return __halves2half2(r1, r2);
 }
 
@@ -222,8 +222,8 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
   float a2 = __high2float(a);
   float b1 = __low2float(b);
   float b2 = __high2float(b);
-  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
   return __halves2half2(r1, r2);
 }
 
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
 #else
   float a1 = __low2float(a);
   float a2 = __high2float(a);
-  return half(__float2half_rn(a1 + a2));
+  return half(internal::raw_uint16_to_half(__float2half_rn(a1 + a2)));
 #endif
 }
 
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
 #else
   float a1 = __low2float(a);
   float a2 = __high2float(a);
-  return half(__float2half_rn(a1 * a2));
+  return half(internal::raw_uint16_to_half(__float2half_rn(a1 * a2)));
 #endif
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ae4ce3c90..31b361c83 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double loadConstant(const double* address) {
   return __ldg(address);
 }
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE  // explicit specialization of loadConstant for Eigen::half, alongside the double one above
+Eigen::half loadConstant(const Eigen::half* address) {  // address: location of the half to read; the value is returned by copy
+  return Eigen::half(internal::raw_uint16_to_half(__ldg(&address->x)));  // __ldg reads member x (presumably the raw 16-bit storage - confirm), then the bits are rewrapped as a half
+}
 #endif
 }