From 75a7fa1919af749ba79a2b70c542320707837f61 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 18 Dec 2015 14:07:31 -0800
Subject: [PATCH] Doubled the speed of full reductions on GPUs.

---
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 49102fca2..af1b9432c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -87,15 +87,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   }
 
   typename Self::CoeffReturnType accum = reducer.initialize();
-  for (Index i = 0; i < NumPerThread; ++i) {
-    const Index index = first_index + i * BlockSize;
-    if (index >= num_coeffs) {
-      break;
-    }
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
     typename Self::CoeffReturnType val = input.m_impl.coeff(index);
     reducer.reduce(val, &accum);
   }
 
+#pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
     reducer.reduce(__shfl_down(accum, offset), &accum);
   }