From 75a7fa1919af749ba79a2b70c542320707837f61 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 18 Dec 2015 14:07:31 -0800
Subject: [PATCH] Doubled the speed of full reductions on GPUs.

---
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 49102fca2..af1b9432c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -87,15 +87,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   }
 
   typename Self::CoeffReturnType accum = reducer.initialize();
-  for (Index i = 0; i < NumPerThread; ++i) {
-    const Index index = first_index + i * BlockSize;
-    if (index >= num_coeffs) {
-      break;
-    }
+  Index max_iter = numext::mini(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
     typename Self::CoeffReturnType val = input.m_impl.coeff(index);
     reducer.reduce(val, &accum);
   }
 
+#pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
     reducer.reduce(__shfl_down(accum, offset), &accum);
   }