mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-03 17:24:11 +08:00
Don't create more cuda blocks than necessary
This commit is contained in:
parent
df31ca3b9e
commit
562078780a
@ -219,9 +219,11 @@ inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression&
|
|||||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||||
if (needs_assign)
|
if (needs_assign)
|
||||||
{
|
{
|
||||||
const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
|
|
||||||
const int block_size = device.maxCudaThreadsPerBlock();
|
const int block_size = device.maxCudaThreadsPerBlock();
|
||||||
|
const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||||
const Index size = array_prod(evaluator.dimensions());
|
const Index size = array_prod(evaluator.dimensions());
|
||||||
|
// Create a least one block to ensure we won't crash if we're called with tensors of size 0.
|
||||||
|
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
|
||||||
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
|
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
|
||||||
}
|
}
|
||||||
evaluator.cleanup();
|
evaluator.cleanup();
|
||||||
@ -236,9 +238,11 @@ inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& e
|
|||||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||||
if (needs_assign)
|
if (needs_assign)
|
||||||
{
|
{
|
||||||
const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
|
|
||||||
const int block_size = device.maxCudaThreadsPerBlock();
|
const int block_size = device.maxCudaThreadsPerBlock();
|
||||||
|
const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||||
const Index size = array_prod(evaluator.dimensions());
|
const Index size = array_prod(evaluator.dimensions());
|
||||||
|
// Create a least one block to ensure we won't crash if we're called with tensors of size 0.
|
||||||
|
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
|
||||||
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
|
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
|
||||||
}
|
}
|
||||||
evaluator.cleanup();
|
evaluator.cleanup();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user