mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
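Tweak the cost model for tensor contraction when parallelizing over the inner dimension: pass `true, output_packet_size` to the compute-cost `TensorOpCost` in `contractionCostPerInnerDim`, lower `kFixedOverHead` from 100000 to 50000, and step the candidate thread count by 2 instead of 1.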
https://bitbucket.org/snippets/rmlarsen/MexxLo
This commit is contained in:
parent
9a3f06d836
commit
039ee52125
@ -1169,7 +1169,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
|||||||
TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
|
TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
|
||||||
// Compute cost.
|
// Compute cost.
|
||||||
const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
|
const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
|
||||||
TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n);
|
TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
|
||||||
// Output stores.
|
// Output stores.
|
||||||
cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
|
cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
|
||||||
TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
|
TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
|
||||||
@ -1192,8 +1192,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
|||||||
int num_threads = 1;
|
int num_threads = 1;
|
||||||
double min_cost = total_parallel_cost;
|
double min_cost = total_parallel_cost;
|
||||||
double kPerThreadOverHead = 4000;
|
double kPerThreadOverHead = 4000;
|
||||||
double kFixedOverHead = 100000;
|
double kFixedOverHead = 50000;
|
||||||
for (int nt = 2; nt <= this->m_device.numThreads(); nt++) {
|
for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
|
||||||
double sequential_cost =
|
double sequential_cost =
|
||||||
kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
|
kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
|
||||||
double parallel_cost = total_parallel_cost / nt + sequential_cost;
|
double parallel_cost = total_parallel_cost / nt + sequential_cost;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user