diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index df15c6204..fd9919829 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(
   if (needs_assign) {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = numext::maxi(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
+    const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c
   if (needs_assign) {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = numext::maxi(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
+    const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1);
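
For reference, a minimal standalone sketch of what the corrected launch-size arithmetic computes. The patch caps max_blocks at the *smaller* of the device's block limit and the number of blocks the multiprocessors can keep resident; the original numext::maxi took the larger of the two, so the launch could exceed device.maxBlocks(). In the sketch, std::min/std::max stand in for numext::mini/numext::maxi, and the hard-coded device limits are hypothetical stand-ins for Eigen's GpuDevice accessors, not values from any real device.

// Sketch of the patched grid-size computation (assumed stand-in values).
#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical stand-ins for device.maxCudaThreadsPerBlock(),
  // device.maxBlocks(), device.getNumCudaMultiProcessors() and
  // device.maxCudaThreadsPerMultiProcessor().
  const int block_size        = 1024;
  const int device_max_blocks = 65535;
  const int multiprocessors   = 20;
  const int threads_per_sm    = 2048;

  // The fix: clamp to whichever limit is tighter. With maxi, the
  // occupancy-based estimate could exceed the device's block cap.
  const int max_blocks =
      std::min(device_max_blocks, multiprocessors * threads_per_sm / block_size);

  // ceil(size / block_size), clamped above by max_blocks and below by 1 so a
  // zero-sized tensor still launches one (empty) block instead of crashing.
  const long size = 0;  // e.g. array_prod() of an empty tensor's dimensions
  const long num_blocks = std::max(
      std::min(static_cast<long>(max_blocks),
               (size + block_size - 1) / block_size),
      1L);

  std::printf("max_blocks = %d, num_blocks = %ld\n", max_blocks, num_blocks);
  return 0;
}

The same clamping appears in both hunks because the executor has a vectorized and a non-vectorized specialization of run(), each computing its launch configuration independently.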