From 217d839816c96dd53d3572bc18489109c85d5266 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 16 Oct 2018 14:53:06 -0700 Subject: [PATCH] Reduce thread scheduling overhead in parallelFor --- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 1612c004b..47025a510 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -214,18 +214,25 @@ struct ThreadPoolDevice { Barrier barrier(static_cast<unsigned int>(block_count)); std::function<void(Index, Index)> handleRange; handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) { - if (lastIdx - firstIdx <= block_size) { - // Single block or less, execute directly. - f(firstIdx, lastIdx); - barrier.Notify(); - return; + while (lastIdx - firstIdx > block_size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; } - // Split into halves and submit to the pool. - Index mid = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, lastIdx); }); - handleRange(firstIdx, mid); + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); }; - handleRange(0, n); + if (block_count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } barrier.Wait(); }