mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-30 16:52:01 +08:00
Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g., on Haswell CPUs).
This commit is contained in:
parent
1e80bddde3
commit
1d23430628
@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
enum {
|
enum {
|
||||||
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
||||||
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
|
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
|
||||||
k_mask = -8,
|
kr = 8,
|
||||||
|
|
||||||
mr = Traits::mr,
|
mr = Traits::mr,
|
||||||
mr_mask = -mr,
|
|
||||||
|
|
||||||
nr = Traits::nr,
|
nr = Traits::nr,
|
||||||
nr_mask = -nr
|
|
||||||
};
|
};
|
||||||
// Increasing k gives us more time to prefetch the content of the "C"
|
// Increasing k gives us more time to prefetch the content of the "C"
|
||||||
// registers. However once the latency is hidden there is no point in
|
// registers. However once the latency is hidden there is no point in
|
||||||
@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
// experimentally).
|
// experimentally).
|
||||||
const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
|
const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
|
||||||
if (k_cache < k) {
|
if (k_cache < k) {
|
||||||
k = k_cache & k_mask;
|
k = k_cache - (k_cache % kr);
|
||||||
eigen_internal_assert(k > 0);
|
eigen_internal_assert(k > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
if (n_cache <= n_per_thread) {
|
if (n_cache <= n_per_thread) {
|
||||||
// Don't exceed the capacity of the l2 cache.
|
// Don't exceed the capacity of the l2 cache.
|
||||||
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
|
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
|
||||||
n = n_cache & nr_mask;
|
n = n_cache - (n_cache % nr);
|
||||||
eigen_internal_assert(n > 0);
|
eigen_internal_assert(n > 0);
|
||||||
} else {
|
} else {
|
||||||
n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
|
n = (std::min<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (l3 > l2) {
|
if (l3 > l2) {
|
||||||
@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
|
const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
|
||||||
const Index m_per_thread = numext::div_ceil(m, num_threads);
|
const Index m_per_thread = numext::div_ceil(m, num_threads);
|
||||||
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
|
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
|
||||||
m = m_cache & mr_mask;
|
m = m_cache - (m_cache % mr);
|
||||||
eigen_internal_assert(m > 0);
|
eigen_internal_assert(m > 0);
|
||||||
} else {
|
} else {
|
||||||
m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
|
m = (std::min<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user