From a9cc6a06b9fe2ea50b0437f73b101116a39da727 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 9 Feb 2016 05:10:06 +0000
Subject: [PATCH 01/18] Fixed compilation warning in the splines test

---
 unsupported/test/splines.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
index 97665af96..3be020434 100644
--- a/unsupported/test/splines.cpp
+++ b/unsupported/test/splines.cpp
@@ -239,7 +239,7 @@ void check_global_interpolation_with_derivatives2d()
   typedef Spline2d::PointType PointType;
   typedef Spline2d::KnotVectorType KnotVectorType;
 
-  const unsigned int numPoints = 100;
+  const Eigen::DenseIndex numPoints = 100;
   const unsigned int dimension = 2;
   const unsigned int degree = 3;
 

From 06a2bc7c9c6af150f54605c74a95379a7c12ca28 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@gmail.com>
Date: Wed, 17 Feb 2016 14:41:59 -0800
Subject: [PATCH 02/18] Tiny bugfix in SpecialFunctions: some compilers don't
 like doubles implicitly downcast to floats in an array constructor.

---
 Eigen/src/Core/SpecialFunctions.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index 6c6b21f98..6b4598e3e 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -182,10 +182,10 @@ struct digamma_impl_maybe_poly<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(const float s) {
     const float A[] = {
-      -4.16666666666666666667E-3,
-      3.96825396825396825397E-3,
-      -8.33333333333333333333E-3,
-      8.33333333333333333333E-2
+      -4.16666666666666666667E-3f,
+      3.96825396825396825397E-3f,
+      -8.33333333333333333333E-3f,
+      8.33333333333333333333E-2f
     };
 
     float z;

From 8ce46f9d8959236c0dfb6dd7dca7423d825f0c59 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 18 Feb 2016 13:24:34 -0800
Subject: [PATCH 03/18] Improved implementation of ptanh for SSE and AVX

---
 Eigen/src/Core/arch/AVX/MathFunctions.h | 42 +++++++++++--------------
 Eigen/src/Core/arch/SSE/MathFunctions.h | 33 +++++++++----------
 2 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index a24bf6e26..98d8e029f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -267,31 +267,34 @@ pexp<Packet8f>(const Packet8f& _x) {
 
 // Hyperbolic Tangent function.
 // Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-8, 8], outside of which the
+// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
 // fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 ptanh<Packet8f>(const Packet8f& _x) {
-  // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later.
-  const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f));
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
+  const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
 
   // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
 
   // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
 
   // Since the polynomials are odd/even, we need x^2.
-  const Packet8f x2 = _mm256_mul_ps(x, x);
+  const Packet8f x2 = pmul(x, x);
 
   // Evaluate the numerator polynomial p.
   Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
@@ -308,14 +311,7 @@ ptanh<Packet8f>(const Packet8f& _x) {
   q = pmadd(x2, q, p8f_beta_0);
 
   // Divide the numerator by the denominator.
-  const Packet8f res = pdiv(p, q);
-
-  // Mask-out values outside of [-8, 8].
-  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(minus_one, -1.0f);
-  return _mm256_blendv_ps(
-      _mm256_blendv_ps(res, p8f_one, _mm256_cmp_ps(x, p8f_one, _CMP_GT_OQ)),
-      p8f_minus_one, _mm256_cmp_ps(x, p8f_minus_one, _CMP_LT_OQ));
+  return pdiv(p, q);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index a7a0d906f..28f103eeb 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -518,30 +518,31 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
 
 // Hyperbolic Tangent function.
 // Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-8, 8], outside of which the
+// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
 // fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 ptanh<Packet4f>(const Packet4f& _x) {
-  // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later.
-  const Packet4f x =
-      pmax(pset1<Packet4f>(-1.0f),
-           pmin(pset1<Packet4f>(1.0f), pmul(_x, pset1<Packet4f>(0.125f))));
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
+  const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
 
   // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
 
   // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
 
   // Since the polynomials are odd/even, we need x^2.
   const Packet4f x2 = pmul(x, x);

From 17b9fbed34cefe08b4f63dbe0734e12311eb8669 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 06:16:07 +0000
Subject: [PATCH 04/18] Added preliminary support for half floats on CUDA GPU.
 For now we can simply convert floats into half floats and vice versa

---
 Eigen/Core                             |   3 +
 Eigen/src/Core/arch/CUDA/PacketMath.h  |   1 -
 Eigen/src/Core/arch/CUDA/TypeCasting.h | 100 +++++++++++++++++++++++++
 unsupported/test/CMakeLists.txt        |   8 +-
 4 files changed, 109 insertions(+), 3 deletions(-)
 create mode 100644 Eigen/src/Core/arch/CUDA/TypeCasting.h

diff --git a/Eigen/Core b/Eigen/Core
index 63602f4c3..17f864084 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -200,6 +200,7 @@
 #if defined __CUDACC__
   #define EIGEN_VECTORIZE_CUDA
   #include <vector_types.h>
+  #include <cuda_fp16.h>
 #endif
 
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -329,7 +330,9 @@ using std::ptrdiff_t;
 
 #if defined EIGEN_VECTORIZE_CUDA
   #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/CUDA/PacketMathHalf.h"
   #include "src/Core/arch/CUDA/MathFunctions.h"
+  #include "src/Core/arch/CUDA/TypeCasting.h"
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index d3d9f910e..d5dcc7fa3 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -21,7 +21,6 @@ namespace internal {
 template<> struct is_arithmetic<float4>  { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };
 
-
 template<> struct packet_traits<float> : default_packet_traits
 {
   typedef float4 type;
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
new file mode 100644
index 000000000..a8c06ff48
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_CUDA_H
+#define EIGEN_TYPE_CASTING_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<>
+struct scalar_cast_op<float, half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+      return __float2half(a);
+    #else
+      assert(false && "tbd");
+      return half();
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+template<>
+struct scalar_cast_op<half, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+      return __half2float(a);
+    #else
+      assert(false && "tbd");
+      return 0.0f;
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+
+
+template <>
+struct type_casting_traits<half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  float2 r1 = __half22float2(a);
+  float2 r2 = __half22float2(b);
+  return make_float4(r1.x, r1.y, r2.x, r2.y);
+#else
+  assert(false && "tbd");
+  return float4();
+#endif
+}
+
+template <>
+struct type_casting_traits<float, half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Simply discard the second half of the input
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __float22half2_rn(make_float2(a.x, a.y));
+#else
+  assert(false && "tbd");
+  return half2();
+#endif
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_CUDA_H
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index c202cf0e4..678a0d1d7 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -37,9 +37,9 @@ if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
 endif()
 
-ei_add_test(matrix_exponential)
+#ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
-ei_add_test(matrix_power)
+#ei_add_test(matrix_power)
 ei_add_test(matrix_square_root)
 ei_add_test(alignedvector3)
 
@@ -173,5 +173,9 @@ if(CUDA_FOUND)
   ei_add_test(cxx11_tensor_random_cuda)
   ei_add_test(cxx11_tensor_argmax_cuda)
 
+  set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_53 -Xcudafe \"--display_error_number\"")
+  ei_add_test(cxx11_tensor_of_float16_cuda)
+
+
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()

From 7151bd876845c15cb6b8abc0886d7917ece635ed Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 06:20:50 +0000
Subject: [PATCH 05/18] Reverted unintended changes introduced by a bad merge

---
 Eigen/Core                      | 1 -
 unsupported/test/CMakeLists.txt | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index 17f864084..3edbe6585 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -330,7 +330,6 @@ using std::ptrdiff_t;
 
 #if defined EIGEN_VECTORIZE_CUDA
   #include "src/Core/arch/CUDA/PacketMath.h"
-  #include "src/Core/arch/CUDA/PacketMathHalf.h"
   #include "src/Core/arch/CUDA/MathFunctions.h"
   #include "src/Core/arch/CUDA/TypeCasting.h"
 #endif
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 678a0d1d7..2c686177b 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -37,9 +37,9 @@ if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
 endif()
 
-#ei_add_test(matrix_exponential)
+ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
-#ei_add_test(matrix_power)
+ei_add_test(matrix_power)
 ei_add_test(matrix_square_root)
 ei_add_test(alignedvector3)
 

From f36c0c2c65a78959f6ccbbc29c6e80f86b062bc8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 06:23:28 +0000
Subject: [PATCH 06/18] Added regression test for float16

---
 .../test/cxx11_tensor_of_float16_cuda.cu      | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 unsupported/test/cxx11_tensor_of_float16_cuda.cu

diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
new file mode 100644
index 000000000..e9f5dd968
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -0,0 +1,60 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_conversion() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random();
+  gpu_half.device(gpu_device) = gpu_float.cast<half>();
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+
+void test_cxx11_tensor_of_float16_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_conversion());
+}

From 0606a0a39bcf01b0a03f0dcd17f7075fce8c402c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 18 Feb 2016 23:15:23 -0800
Subject: [PATCH 07/18] FP16 on CUDA are only available starting with cuda 7.5.
 Disable them when using an older version of CUDA

---
 Eigen/Core                                       | 5 ++++-
 Eigen/src/Core/arch/CUDA/TypeCasting.h           | 3 +++
 unsupported/test/cxx11_tensor_of_float16_cuda.cu | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index 3edbe6585..834ff9415 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -200,7 +200,10 @@
 #if defined __CUDACC__
   #define EIGEN_VECTORIZE_CUDA
   #include <vector_types.h>
-  #include <cuda_fp16.h>
+  #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+    #include <cuda_fp16.h>
+  #endif
 #endif
 
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index a8c06ff48..279fd4fd0 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -14,6 +14,8 @@ namespace Eigen {
 
 namespace internal {
 
+#if defined(EIGEN_HAS_CUDA_FP16)
+
 template<>
 struct scalar_cast_op<float, half> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
@@ -92,6 +94,7 @@ template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
 #endif
 }
 
+#endif
 
 } // end namespace internal
 
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index e9f5dd968..aee222a14 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -19,6 +19,7 @@
 
 using Eigen::Tensor;
 
+#ifdef EIGEN_HAS_CUDA_FP16
 void test_cuda_conversion() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -52,9 +53,11 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_half);
   gpu_device.deallocate(d_conv);
 }
-
+#endif
 
 void test_cxx11_tensor_of_float16_cuda()
 {
+#ifdef EIGEN_HAS_CUDA_FP16
   CALL_SUBTEST_1(test_cuda_conversion());
+#endif
 }

From ac5d706a942faa275ea467009d2004a7aeb3e3fa Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 08:19:12 +0000
Subject: [PATCH 08/18] Added support for simple coefficient wise tensor
 expression using half floats on CUDA devices

---
 Eigen/Core                                    |   1 +
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h     | 220 ++++++++++++++++++
 .../test/cxx11_tensor_of_float16_cuda.cu      |  43 ++++
 3 files changed, 264 insertions(+)
 create mode 100644 Eigen/src/Core/arch/CUDA/PacketMathHalf.h

diff --git a/Eigen/Core b/Eigen/Core
index 834ff9415..7107f83d0 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -333,6 +333,7 @@ using std::ptrdiff_t;
 
 #if defined EIGEN_VECTORIZE_CUDA
   #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/CUDA/PacketMathHalf.h"
   #include "src/Core/arch/CUDA/MathFunctions.h"
   #include "src/Core/arch/CUDA/TypeCasting.h"
 #endif
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
new file mode 100644
index 000000000..7f99376fb
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -0,0 +1,220 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+__device__ half operator + (const half& a, const half& b) {
+  return __hadd(a, b);
+}
+__device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+__device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+__device__ half operator / (const half& a, const half& b) {
+  assert(false && "tbd");
+  return half();
+}
+__device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<half> : default_packet_traits
+{
+  typedef half2 type;
+  typedef half2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+
+template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
+  return __half2half2(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+  return __halves2half2(a, __hadd(a, __float2half(1)));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+  return __hadd2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+  return __hsub2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+  return __hneg2(a);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+  return __hmul2(a, b);
+}
+
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+   return __hfma2(a, b, c);
+ }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  half r2 = a2 < b2 ? __high2half(a) : __high2half(b); 
+  return __halves2half2(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half*  from) {
+  return __halves2half2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+  return __ldg((const half2*)from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
+  return __halves2half2(__ldg(from+0), __ldg(from+1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
+  return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
+  return __low2half(a);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+  return __hadd(__low2half(a), __high2half(a));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+  half first = __low2half(a);
+  half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+  half first = __low2half(a);
+  half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+  return __hmul(__low2half(a), __high2half(a));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+  assert(false && "tbd");
+  return half2();
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  assert(false && "tbd");
+  //  half tmp = kernel.packet[0].y;
+  //  kernel.packet[0].y = kernel.packet[1].x;
+  //  kernel.packet[1].x = tmp;
+}
+
+#endif
+#endif
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index aee222a14..26c18a718 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -20,6 +20,7 @@
 using Eigen::Tensor;
 
 #ifdef EIGEN_HAS_CUDA_FP16
+
 void test_cuda_conversion() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -53,11 +54,53 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_half);
   gpu_device.deallocate(d_conv);
 }
+
+void test_cuda_elementwise() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+      d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+      d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<half>() + gpu_float2.cast<half>()) * gpu_float1.cast<half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
 #endif
 
+
 void test_cxx11_tensor_of_float16_cuda()
 {
 #ifdef EIGEN_HAS_CUDA_FP16
   CALL_SUBTEST_1(test_cuda_conversion());
+  CALL_SUBTEST_1(test_cuda_element_wise());
 #endif
 }

From cd042dbbfdd4680c983d89c4f526c49d4657c05d Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 15:03:26 +0000
Subject: [PATCH 09/18] Fixed a bug in the tensor type converter

---
 unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index d2defcaf4..e254c0b7b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -124,9 +124,12 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
       return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
     } else {
       const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+      typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+      typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
+      internal::scalar_cast_op<SrcType, TgtType> converter;
       EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
       for (int i = 0; i < TgtPacketSize; ++i) {
-        values[i] = m_impl.coeff(index+i);
+        values[i] = converter(m_impl.coeff(index+i));
       }
       TgtPacket rslt = internal::pload<TgtPacket>(values);
       return rslt;

From dc26459b9910d8c1fda964917635ee8277dd2614 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 15:16:54 +0000
Subject: [PATCH 10/18] Implemented protate() for CUDA

---
 Eigen/src/Core/arch/CUDA/PacketMath.h | 29 +++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index d5dcc7fa3..a32b41e18 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -272,6 +272,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
   return a.x * a.y;
 }
 
+template<size_t offset>
+struct protate_impl<offset, float4>
+{
+  static float4 run(const float4& a) {
+    if (offset == 0) {
+      return make_float4(a.x, a.y, a.z, a.w);
+    }
+    if (offset == 1) {
+      return make_float4(a.w, a.x, a.y, a.z);
+    }
+    if (offset == 2) {
+      return make_float4(a.z, a.w, a.x, a.y);
+    }
+    return make_float4(a.y, a.z, a.w, a.x);
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, double2>
+{
+  static double2 run(const double2& a) {
+    if (offset == 0) {
+      return make_double2(a.x, a.y);
+    }
+    return make_double2(a.y, a.x);
+  }
+};
+
+
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }

From f7cb755299b8cdee3b8eaffd2941af9ee6d08b04 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 15:57:26 +0000
Subject: [PATCH 11/18] Added support for operators +=, -=, *= and /= on CUDA
 half floats

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 7f99376fb..c99c1acf7 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -39,6 +39,22 @@ __device__ half operator / (const half& a, const half& b) {
 __device__ half operator - (const half& a) {
   return __hneg(a);
 }
+__device__ half operator += (half& a, const half& b) {
+   a = __hadd(a, b);
+   return a;
+}
+__device__ half operator *= (half& a, const half& b) {
+  a = __hmul(a, b);
+  return a;
+}
+__device__ half operator -= (half& a, const half& b) {
+  a = __hsub(a, b);
+  return a;
+}
+__device__ half operator /= (half& a, const half& b) {
+  assert(false && "tbd");
+  return a;
+}
 
 
 template<> struct is_arithmetic<half2> { enum { value = true }; };

From f3352e0fb02d1048a4c21c969b10e84185f4e5bf Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 15:58:57 +0000
Subject: [PATCH 12/18] Don't make the array constructors explicit

---
 unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index 56e2b8afc..eae8b996c 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -42,7 +42,7 @@ template <typename T, size_t n> class array {
 
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE array() { }
-  explicit EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE array(const T& v) {
     EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v;

From a08d2ff0c911f7f27dd8cb0ca14fa5b9419b3488 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 15:59:59 +0000
Subject: [PATCH 13/18] Started to work on contractions and reductions using
 half floats

---
 .../test/cxx11_tensor_of_float16_cuda.cu      | 95 ++++++++++++++++++-
 1 file changed, 94 insertions(+), 1 deletion(-)

diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 26c18a718..d3cd94cd6 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -94,6 +94,97 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_float);
 }
 
+/*
+void test_cuda_contractions() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int rows = 101;
+  int cols = 101;
+  int num_elem = rows*cols;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims);
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<half>().contract(gpu_float2.cast<half>(), dims).cast<float>();
+
+  Tensor<float, 2> half_prec(rows, cols);
+  Tensor<float, 2> full_prec(rows, cols);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+    }
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+
+void test_cuda_reductions() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int size = 101;
+  int num_elem = size*size;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, size);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+
+  Eigen::array<int, 1> redux_dim = {{0}};
+  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim);
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<half>().sum(redux_dim).cast<float>();
+
+  Tensor<float, 1> half_prec(size);
+  Tensor<float, 1> full_prec(size);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+
+  for (int i = 0; i < size; ++i) {
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+*/
+
 #endif
 
 
@@ -101,6 +192,8 @@ void test_cxx11_tensor_of_float16_cuda()
 {
 #ifdef EIGEN_HAS_CUDA_FP16
   CALL_SUBTEST_1(test_cuda_conversion());
-  CALL_SUBTEST_1(test_cuda_element_wise());
+  CALL_SUBTEST_1(test_cuda_elementwise());
+//  CALL_SUBTEST_2(test_cuda_contractions());
+//  CALL_SUBTEST_3(test_cuda_reductions());
 #endif
 }

From f268db1c4bd0510f13f0218205c2e135f2790175 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 16:31:04 +0000
Subject: [PATCH 14/18] Added the ability to query the minor version of a cuda
 device

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index e684ab8f7..3808eb155 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -247,6 +247,14 @@ struct GpuDevice {
     return 0;
 #endif
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+    return stream_->deviceProperties().minor;
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return 0;
+#endif
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
     return max_blocks_;

From 5c4901b83a3ec15988521e195abc05e804c541dc Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 10:03:19 -0800
Subject: [PATCH 15/18] Implemented the scalar division of 2 half floats

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index c99c1acf7..d0106f4f1 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -33,8 +33,9 @@ __device__ half operator - (const half& a, const half& b) {
   return __hsub(a, b);
 }
 __device__ half operator / (const half& a, const half& b) {
-  assert(false && "tbd");
-  return half();
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
 }
 __device__ half operator - (const half& a) {
   return __hneg(a);
@@ -52,7 +53,7 @@ __device__ half operator -= (half& a, const half& b) {
   return a;
 }
 __device__ half operator /= (half& a, const half& b) {
-  assert(false && "tbd");
+  a = a / b;
   return a;
 }
 

From 180156ba1aefceae0bd93f056e5807a83ccbb1b5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 10:05:59 -0800
Subject: [PATCH 16/18] Added support for tensor reductions on half floats

---
 Eigen/src/Core/arch/CUDA/TypeCasting.h        | 20 ++++++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   | 15 +++++++-----
 .../test/cxx11_tensor_of_float16_cuda.cu      | 23 +++++++++++++------
 3 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 279fd4fd0..2742a4e7b 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -34,6 +34,26 @@ template<>
 struct functor_traits<scalar_cast_op<float, half> >
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
 
+
+template<>
+struct scalar_cast_op<int, half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+      return __float2half(static_cast<float>(a));
+    #else
+      assert(false && "tbd");
+      return half();
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
 template<>
 struct scalar_cast_op<half, float> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index f94ffa020..e2d876140 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -72,11 +72,12 @@ template <typename T> struct SumReducer
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return static_cast<T>(0);
+    internal::scalar_cast_op<int, T> conv;
+    return conv(0);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
-    return pset1<Packet>(0);
+    return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
     return accum;
@@ -110,11 +111,12 @@ template <typename T> struct MeanReducer
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return static_cast<T>(0);
+    internal::scalar_cast_op<int, T> conv;
+    return conv(0);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
-    return pset1<Packet>(0);
+    return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
     return accum / scalarCount_;
@@ -214,11 +216,12 @@ template <typename T> struct ProdReducer
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return static_cast<T>(1);
+    internal::scalar_cast_op<int, T> conv;
+    return conv(1);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
-    return pset1<Packet>(1);
+    return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
     return accum;
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index d3cd94cd6..5ce96a1c2 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -93,7 +93,6 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
 }
-
 /*
 void test_cuda_contractions() {
   Eigen::CudaStreamDevice stream;
@@ -139,7 +138,7 @@ void test_cuda_contractions() {
   gpu_device.deallocate(d_float2);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
-}
+}*/
 
 
 void test_cuda_reductions() {
@@ -183,7 +182,7 @@ void test_cuda_reductions() {
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
 }
-*/
+
 
 #endif
 
@@ -191,9 +190,19 @@ void test_cuda_reductions() {
 void test_cxx11_tensor_of_float16_cuda()
 {
 #ifdef EIGEN_HAS_CUDA_FP16
-  CALL_SUBTEST_1(test_cuda_conversion());
-  CALL_SUBTEST_1(test_cuda_elementwise());
-//  CALL_SUBTEST_2(test_cuda_contractions());
-//  CALL_SUBTEST_3(test_cuda_reductions());
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice device(&stream);
+  if (device.majorDeviceVersion() > 5 ||
+      (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) {
+    CALL_SUBTEST_1(test_cuda_conversion());
+    CALL_SUBTEST_1(test_cuda_elementwise());
+//    CALL_SUBTEST_2(test_cuda_contractions());
+    CALL_SUBTEST_3(test_cuda_reductions());
+  }
+  else {
+   std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl;
+  }
+#else
+  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
 #endif
 }

From 670db7988d6903fbb51c449383c4d5162d83caaf Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 13:03:26 -0800
Subject: [PATCH 17/18] Updated the contraction code to make it compatible with
 half floats.

---
 .../CXX11/src/Tensor/TensorContractionCuda.h  | 100 ++++++++++--------
 1 file changed, 54 insertions(+), 46 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index a5f3debc4..f5b539c7e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -99,23 +99,23 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 
 #define prefetchIntoRegisters(base_k)                           \
   {                                                             \
-    lhs_pf0 = Scalar(0);                                        \
-    lhs_pf1 = Scalar(0);                                        \
-    lhs_pf2 = Scalar(0);                                        \
-    lhs_pf3 = Scalar(0);                                        \
-    lhs_pf4 = Scalar(0);                                        \
-    lhs_pf5 = Scalar(0);                                        \
-    lhs_pf6 = Scalar(0);                                        \
-    lhs_pf7 = Scalar(0);                                        \
+    lhs_pf0 = conv(0);                                          \
+    lhs_pf1 = conv(0);                                          \
+    lhs_pf2 = conv(0);                                          \
+    lhs_pf3 = conv(0);                                          \
+    lhs_pf4 = conv(0);                                          \
+    lhs_pf5 = conv(0);                                          \
+    lhs_pf6 = conv(0);                                          \
+    lhs_pf7 = conv(0);                                          \
                                                                 \
-    rhs_pf0 = Scalar(0);                                        \
-    rhs_pf1 = Scalar(0);                                        \
-    rhs_pf2 = Scalar(0);                                        \
-    rhs_pf3 = Scalar(0);                                        \
-    rhs_pf4 = Scalar(0);                                        \
-    rhs_pf5 = Scalar(0);                                        \
-    rhs_pf6 = Scalar(0);                                        \
-    rhs_pf7 = Scalar(0);                                        \
+    rhs_pf0 = conv(0);                                          \
+    rhs_pf1 = conv(0);                                          \
+    rhs_pf2 = conv(0);                                          \
+    rhs_pf3 = conv(0);                                          \
+    rhs_pf4 = conv(0);                                          \
+    rhs_pf5 = conv(0);                                          \
+    rhs_pf6 = conv(0);                                          \
+    rhs_pf7 = conv(0);                                          \
                                                                 \
     if (!needs_edge_check || lhs_vert < m_size) {               \
       const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
@@ -261,15 +261,16 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   // declare and initialize result array
 #define res(i, j) _res_##i##j
 #define initResultRow(i)                        \
-  Scalar res(i, 0) = Scalar(0);                 \
-  Scalar res(i, 1) = Scalar(0);                 \
-  Scalar res(i, 2) = Scalar(0);                 \
-  Scalar res(i, 3) = Scalar(0);                 \
-  Scalar res(i, 4) = Scalar(0);                 \
-  Scalar res(i, 5) = Scalar(0);                 \
-  Scalar res(i, 6) = Scalar(0);                 \
-  Scalar res(i, 7) = Scalar(0);                 \
+  Scalar res(i, 0) = conv(0);                   \
+  Scalar res(i, 1) = conv(0);                   \
+  Scalar res(i, 2) = conv(0);                   \
+  Scalar res(i, 3) = conv(0);                   \
+  Scalar res(i, 4) = conv(0);                   \
+  Scalar res(i, 5) = conv(0);                   \
+  Scalar res(i, 6) = conv(0);                   \
+  Scalar res(i, 7) = conv(0);                   \
 
+  internal::scalar_cast_op<int, Scalar> conv;
   initResultRow(0);
   initResultRow(1);
   initResultRow(2);
@@ -1313,6 +1314,34 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }
   }
 
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+    const Index m_blocks = (m + 63) / 64;
+    const Index n_blocks = (n + 63) / 64;
+    const dim3 num_blocks(m_blocks, n_blocks, 1);
+    const dim3 block_size(8, 8, 8);
+    LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
   void evalTyped(Scalar* buffer) const {
     // columns in left side, rows in right side
@@ -1353,28 +1382,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     OutputMapper output(buffer, m);
 
     setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
-    if (internal::is_same<LhsScalar, float>::value &&
-        internal::is_same<RhsScalar, float>::value) {
-      if (m < 768 || n < 768) {
-        const Index m_blocks = (m + 63) / 64;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(16, 16, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
-      } else {
-        const Index m_blocks = (m + 127) / 128;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(8, 32, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
-      }
-    } else {
-      const Index m_blocks = (m + 63) / 64;
-      const Index n_blocks = (n + 63) / 64;
-      const dim3 num_blocks(m_blocks, n_blocks, 1);
-      const dim3 block_size(8, 8, 8);
-      LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
-    }
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
   }
 };
 

From 46fc23f91c1c5ea21bab67976773c613bd7e4ab0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 19 Feb 2016 13:44:22 -0800
Subject: [PATCH 18/18] Print an error message to stderr when the
 initialization of the CUDA runtime fails. This helps debugging setup issues.

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h   | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 3808eb155..c01704e56 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -34,12 +34,23 @@ static void initializeDeviceProp() {
     if (!m_devicePropInitialized) {
       int num_devices;
       cudaError_t status = cudaGetDeviceCount(&num_devices);
-      EIGEN_UNUSED_VARIABLE(status)
-      assert(status == cudaSuccess);
+      if (status != cudaSuccess) {
+        std::cerr << "Failed to get the number of CUDA devices: "
+                  << cudaGetErrorString(status)
+                  << std::endl;
+        assert(status == cudaSuccess);
+      }
       m_deviceProperties = new cudaDeviceProp[num_devices];
       for (int i = 0; i < num_devices; ++i) {
         status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
-        assert(status == cudaSuccess);
+        if (status != cudaSuccess) {
+          std::cerr << "Failed to initialize CUDA device #"
+                    << i
+                    << ": "
+                    << cudaGetErrorString(status)
+                    << std::endl;
+          assert(status == cudaSuccess);
+        }
       }
       m_devicePropInitialized = true;
     }