Doubled the speed of full reductions on GPUs.

This commit is contained in:
Benoit Steiner 2015-12-18 14:07:31 -08:00
parent 3abd8470ca
commit 75a7fa1919

View File

@@ -87,15 +87,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
} }
typename Self::CoeffReturnType accum = reducer.initialize(); typename Self::CoeffReturnType accum = reducer.initialize();
for (Index i = 0; i < NumPerThread; ++i) { Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
const Index index = first_index + i * BlockSize; for (Index i = 0; i < max_iter; i+=BlockSize) {
if (index >= num_coeffs) { const Index index = first_index + i;
break; eigen_assert(index < num_coeffs);
}
typename Self::CoeffReturnType val = input.m_impl.coeff(index); typename Self::CoeffReturnType val = input.m_impl.coeff(index);
reducer.reduce(val, &accum); reducer.reduce(val, &accum);
} }
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
reducer.reduce(__shfl_down(accum, offset), &accum); reducer.reduce(__shfl_down(accum, offset), &accum);
} }