From aed4cb1269d52d0ff0e69c8aa6d89c804185b18f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 21:45:14 -0800 Subject: [PATCH] Use warp shuffles instead of shared memory access to speedup the inner reduction kernel. --- .../CXX11/src/Tensor/TensorReductionCuda.h | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 54ab34ba1..82ea09f07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -132,8 +132,6 @@ struct FullReducer { }; -extern __shared__ float temp[]; - template __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, @@ -183,17 +181,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } - temp[threadIdx.x] = reduced_val; +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + } - __syncthreads(); - const int warp_id = threadIdx.x & 31; - if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); - if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); - if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); - if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); - if (warp_id < 1) { - reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); - atomicReduce(&(output[row]), temp[threadIdx.x], reducer); + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); } } @@ -224,7 +218,7 @@ struct InnerReducer { EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), - num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } };