Don't create more cuda blocks than necessary

2025-09-18 04:13:14 +08:00 · 2015-11-23 11:00:10 -08:00 · 2015-11-23 11:00:10 -08:00 · 562078780a
commit 562078780a
parent df31ca3b9e
1 changed files with 6 additions and 2 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@ -219,9 +219,11 @@ inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression&
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign)
  {
    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
@ -236,9 +238,11 @@ inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& e
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign)
  {
    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();