From 8adf5cc70f232f919302332e2a2fa0971c34f264 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 6 May 2016 19:16:43 -0700
Subject: [PATCH] Added support for packet processing of fp16 on kepler and
 maxwell gpus

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 99 +++++++++++++++++++++--
 Eigen/src/Core/arch/CUDA/TypeCasting.h    |  2 +-
 2 files changed, 93 insertions(+), 8 deletions(-)
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 61d532e4d..0cebc1017 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -17,8 +17,8 @@
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 
-// Most of the following operations require arch >= 5.3
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+// Most of the following operations require arch >= 3.0
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 
 namespace Eigen {
 namespace internal {
@@ -67,13 +67,21 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, co
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
-  return __ldg((const half2*)from);
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+#if __CUDA_ARCH__ >= 320
+   return __ldg((const half2*)from);
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
 }
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
-  return __halves2half2(__ldg(from+0), __ldg(from+1));
+#if __CUDA_ARCH__ >= 320
+   return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
@@ -107,30 +115,83 @@ ptranspose(PacketBlock<half2,2>& kernel) {
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+#if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
 }
 
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if __CUDA_ARCH__ >= 530
    return __hfma2(a, b, c);
- }
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c2;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
@@ -163,23 +224,47 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
 }
 
 template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return half(__float2half_rn(a1 + a2));
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   half first = __low2half(a);
   half second = __high2half(a);
   return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return half(__float2half_rn(numext::maxi(a1, a2)));
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   half first = __low2half(a);
   half second = __high2half(a);
   return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return half(__float2half_rn(numext::mini(a1, a2)));
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return half(__float2half_rn(a1 * a2));
+#endif
 }
 
 } // end namespace internal
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 396b38eaf..3ea218133 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -71,7 +71,7 @@ struct functor_traits<scalar_cast_op<half, float> >
 
 
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 
 template <>
 struct type_casting_traits<half, float> {