From 403a7cb6c34d163e4f120387b5dc5487d30bb1d5 Mon Sep 17 00:00:00 2001
From: Jeremy Barnes <jeremy@barneso.com>
Date: Sun, 10 Jan 2016 22:39:13 -0500
Subject: [PATCH 1/2] Alternative way of forcing instantiation of device
 kernels without causing warnings or requiring device to device kernel
 invocations.

This allows Tensorflow to work on SM 3.0 (ie, Amazon EC2) machines.
---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h  | 10 ++++++++++
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h   |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h       |  4 ++--
 3 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index af140a68b..359a01b8f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -242,6 +242,16 @@ struct GpuDevice {
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
   assert(cudaGetLastError() == cudaSuccess);
 
+#ifndef __CUDA_ARCH__
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
+    (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+  assert(cudaGetLastError() == cudaSuccess);
+#else
+#define LAUNCH_CUDA_KERNEL(kernel, ...)                                  \
+    { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+   eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
+#endif
+
 
 // FIXME: Should be device and kernel specific.
 #ifdef __CUDACC__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index fd7064459..9a66e81f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -506,7 +506,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 558d0c83d..374edb605 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -116,7 +116,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(false && "Should only be called on floats");
+    eigen_assert(false && "Should only be called on floats");
   }
 
   static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
@@ -126,7 +126,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
   }
 };

From 91678f489a89b1f4a393cd3b703e67922b5c4257 Mon Sep 17 00:00:00 2001
From: Jeremy Barnes <jeremy@barneso.com>
Date: Sun, 10 Jan 2016 22:44:45 -0500
Subject: [PATCH 2/2] Cleaned up double-defined macro from last commit

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 359a01b8f..c74613873 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -238,10 +238,6 @@ struct GpuDevice {
 
 };
 
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
-  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
-  assert(cudaGetLastError() == cudaSuccess);
-
 #ifndef __CUDA_ARCH__
 #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
     (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \