diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 956672771..d93e1de1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -219,9 +219,11 @@ inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression&
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
   {
-    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
     LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
@@ -236,9 +238,11 @@ inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& e
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
   {
-    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
     LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
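
For reference, a minimal standalone sketch (not part of the patch) of the clamping arithmetic the patch introduces. The device numbers here, 1024 threads per block and 30 multiprocessors with 2048 resident threads each, are hypothetical example values rather than anything queried from a real GpuDevice:

// Sketch of the block-count clamping above; numbers are made up for illustration.
#include <algorithm>
#include <cstdio>

typedef long Index;

static int clamped_num_blocks(Index size, int block_size, int max_blocks) {
  const Index wanted = (size + block_size - 1) / block_size;  // ceil(size / block_size)
  // Cap at the device's resident-block budget, but never go below 1 so a
  // size-0 tensor still produces a valid (if empty) kernel launch.
  return (int)std::max(std::min((Index)max_blocks, wanted), (Index)1);
}

int main() {
  const int block_size = 1024;                    // stands in for maxCudaThreadsPerBlock()
  const int max_blocks = 30 * 2048 / block_size;  // 30 SMs * 2048 threads -> 60 blocks
  std::printf("%d\n", clamped_num_blocks(0, block_size, max_blocks));          // 1
  std::printf("%d\n", clamped_num_blocks(5000, block_size, max_blocks));       // 5
  std::printf("%d\n", clamped_num_blocks(100000000, block_size, max_blocks));  // 60
}

Without the outer max with 1, a size-0 tensor would yield a zero-block launch, which CUDA rejects since every grid dimension must be at least 1; the inner min keeps small tensors from launching more blocks than they have work for.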