Reduce overhead for small tensors and cheap ops by short-circuiting the cost computation and block size calculation in parallelFor.

commit f519fca72b
parent 86ae94462e
Author: Rasmus Munk Larsen
Date: 2016-05-17 16:06:00 -07:00

3 changed files with 24 additions and 34 deletions
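
The pattern is the same in all three files below: read the device's thread count first, and only build the cost estimate (and let parallelFor pick block sizes) when more than one thread is actually available; with a single thread the kernel is run inline. A minimal standalone sketch of that short-circuit follows. It is an illustration only, under assumed names: OpCost, threads_for, and eval are hypothetical stand-ins for Eigen's TensorOpCost, TensorCostModel<ThreadPoolDevice>::numThreads, and the tensor executor, and the block loop stands in for the thread pool's parallelFor.

// Sketch of the short-circuit applied by this commit (not Eigen's actual API).
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>

struct OpCost { double cycles_per_coeff; };  // hypothetical stand-in for TensorOpCost

// Hypothetical stand-in for TensorCostModel::numThreads: give each thread at
// least ~100k cycles of work, capped by the pool size.
int threads_for(std::size_t n, OpCost cost, int max_threads) {
  const double total = cost.cycles_per_coeff * static_cast<double>(n);
  const int wanted = static_cast<int>(total / 100000.0);
  return std::max(1, std::min(max_threads, wanted));
}

void eval(std::size_t size, int pool_threads,
          const std::function<void(std::size_t, std::size_t)>& kernel,
          const std::function<OpCost()>& cost_fn) {
  int num_threads = pool_threads;
  if (num_threads > 1) {
    // The (possibly expensive) cost estimate is computed only when it can matter.
    num_threads = threads_for(size, cost_fn(), num_threads);
  }
  if (num_threads == 1) {
    kernel(0, size);  // short-circuit: no cost model, no block sizing, no dispatch
    return;
  }
  // With more than one thread, a real implementation hands (size, cost, kernel)
  // to the thread pool's parallelFor, which derives a block size from the cost.
  const std::size_t block = (size + num_threads - 1) / num_threads;
  for (std::size_t start = 0; start < size; start += block) {
    kernel(start, std::min(size, start + block));  // sequential stand-in for dispatch
  }
}

int main() {
  double sum = 0;
  eval(16, /*pool_threads=*/8,
       [&](std::size_t first, std::size_t last) {
         for (std::size_t i = first; i < last; ++i) sum += static_cast<double>(i);
       },
       [] { return OpCost{2.0}; });
  std::cout << sum << "\n";  // 16 cheap coefficients: runs inline on one thread
}

Keeping the single-thread case out of parallelFor is what removes the per-call overhead for small tensors and cheap ops; the diffs below apply exactly that reordering to the executor and the full reducer.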

unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h

@@ -10,9 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 
-// Turn on the cost model by default
-#define EIGEN_USE_COST_MODEL
-
 namespace Eigen {
 
 /** \class TensorEvaluator

unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h

@@ -152,23 +152,25 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
     {
       const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
-      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
-                         });
-#else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
+      TensorOpCost cost;
       if (num_threads > 1) {
+        cost = evaluator.costPerCoeff(Vectorizable);
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+        device.parallelFor(
+            size, cost,
+            EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+            [&evaluator](Index first, Index last) {
+              EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first,
+                                                             last);
+            });
+#else
         Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -184,8 +186,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
               &evaluator, numblocks * blocksize, size);
         }
         barrier.Wait();
+#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
       }
-#endif  // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
     }
     evaluator.cleanup();
   }

unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h

@@ -248,16 +248,15 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
-    const TensorOpCost cost =
-        self.m_impl.costPerCoeff(Vectorizable) +
-        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
-                     PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-        num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
+    int num_threads = device.numThreads();
+    if (num_threads > 1) {
+      const TensorOpCost cost =
+          self.m_impl.costPerCoeff(Vectorizable) +
+          TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                       PacketSize);
+      num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+          num_coeffs, cost, device.numThreads());
+    }
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -472,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));