From a1e08fb2a55bf60c81de1687f825d0c3d4e62d22 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Dec 2015 16:30:10 -0800 Subject: [PATCH] Optimized the configuration of the outer reduction cuda kernel --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 20dc72e85..8e250867c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -179,7 +179,10 @@ struct OuterReducer { const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; - const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int dyn_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);