Mirror of https://gitlab.com/libeigen/eigen.git
Reduce overhead for small tensors and cheap ops by short-circuiting the cost computation and block size calculation in parallelFor.
parent 86ae94462e
commit f519fca72b
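In outline, the executor and reducer changes below follow one pattern: skip the cost computation (and the cost-model thread count) entirely when the device has a single thread, and evaluate inline instead. A minimal, self-contained C++ sketch of that pattern; `Cost`, `computeCostPerCoeff`, `costModelThreads`, `evalInline` and `parallelForStub` are made-up stand-ins for the Eigen internals (TensorOpCost, Evaluator::costPerCoeff, TensorCostModel::numThreads, EvalRange::run, ThreadPoolDevice::parallelFor), not the real API:

#include <algorithm>
#include <cstdio>

// Stand-in for Eigen's TensorOpCost (simplified to a single number).
struct Cost { double cycles_per_coeff = 0.0; };

Cost computeCostPerCoeff() {
  Cost c;
  c.cycles_per_coeff = 12.0;  // in Eigen this walks the expression tree; treat it as expensive
  return c;
}

int costModelThreads(long size, const Cost& c, int max_threads) {
  // Toy heuristic: more total work -> more threads, capped by the device.
  int t = static_cast<int>(size * c.cycles_per_coeff / 100000.0) + 1;
  return std::min(max_threads, std::max(1, t));
}

void evalInline(long first, long last) { std::printf("inline [%ld, %ld)\n", first, last); }
void parallelForStub(long size, const Cost&) { std::printf("thread pool, size=%ld\n", size); }

void evaluate(long size, int num_threads) {
  Cost cost;                               // default-constructed, cheap
  if (num_threads > 1) {                   // short-circuit: skip the cost model for one thread
    cost = computeCostPerCoeff();
    num_threads = costModelThreads(size, cost, num_threads);
  }
  if (num_threads == 1) {
    evalInline(0, size);                   // no pool dispatch, no block size math
  } else {
    parallelForStub(size, cost);           // the cost computed above is reused by the pool
  }
}

int main() {
  evaluate(64, 1);        // single thread: the cost model never runs
  evaluate(1 << 20, 8);   // large tensor: the cost model picks a thread count
}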
@@ -10,9 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 
-// Turn on the cost model by default
-#define EIGEN_USE_COST_MODEL
-
 namespace Eigen {
 
 /** \class TensorEvaluator
@@ -152,23 +152,25 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
     {
       const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
-      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
-                         });
-#else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
+      TensorOpCost cost;
       if (num_threads > 1) {
+        cost = evaluator.costPerCoeff(Vectorizable);
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+        device.parallelFor(
+            size, cost,
+            EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+            [&evaluator](Index first, Index last) {
+              EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first,
+                                                             last);
+            });
+#else
         Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -184,8 +186,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
              &evaluator, numblocks * blocksize, size);
         }
         barrier.Wait();
+#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
       }
-#endif  // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
     }
     evaluator.cleanup();
   }
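For reference, the block sizing kept in the EIGEN_USE_SIMPLE_THREAD_POOL branch above splits the work across threads and rounds each block down to a multiple of the packet size (never below one packet); the leftover tail is evaluated by the calling thread. A small standalone C++ sketch with made-up numbers (the 8-wide packet is an assumption, e.g. AVX floats):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Mirrors the blocksz/blocksize computation shown in the context lines above.
long blockSize(long size, long num_threads, long PacketSize) {
  long blocksz = static_cast<long>(
      std::ceil(static_cast<float>(size) / num_threads)) + PacketSize - 1;
  // Round down to a multiple of PacketSize, but never below one packet.
  return std::max(PacketSize, blocksz - (blocksz % PacketSize));
}

int main() {
  // e.g. 1000 coefficients, 4 threads, packets of 8 floats:
  long blocksize = blockSize(1000, 4, 8);  // ceil(250) + 7 = 257 -> rounded to 256
  long numblocks = 1000 / blocksize;       // 3 full blocks; the 232-element tail runs inline
  std::printf("blocksize=%ld numblocks=%ld\n", blocksize, numblocks);
}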
@@ -248,16 +248,15 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
-    const TensorOpCost cost =
-        self.m_impl.costPerCoeff(Vectorizable) +
-        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
-                     PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-        num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
+    int num_threads = device.numThreads();
+    if (num_threads > 1) {
+      const TensorOpCost cost =
+          self.m_impl.costPerCoeff(Vectorizable) +
+          TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                       PacketSize);
+      num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+          num_coeffs, cost, device.numThreads());
+    }
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
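The reducer above assembles its cost by adding the reduction functor's cost on top of the input expression's per-coefficient cost, and now only does so when more than one thread is available. A simplified sketch of that composition; the single-number `OpCost` is a stand-in for Eigen's TensorOpCost, which also tracks bytes loaded and stored and amortizes the functor cost over the packet size when vectorized:

#include <cstdio>

// Stand-in for Eigen's TensorOpCost, reduced to compute cycles only.
struct OpCost {
  double compute_cycles;
  OpCost operator+(const OpCost& o) const {
    return OpCost{compute_cycles + o.compute_cycles};
  }
};

int main() {
  const int PacketSize = 8;         // assumed packet width (e.g. 8 floats)
  const double functor_cost = 1.0;  // stand-in for internal::functor_traits<Op>::Cost
  // Input cost per coefficient plus the functor, amortized per packet when
  // vectorized -- mirroring the cost expression in the hunk above.
  OpCost cost = OpCost{4.0} + OpCost{functor_cost / PacketSize};
  std::printf("cycles per coefficient: %.3f\n", cost.compute_cycles);
}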
@@ -472,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));