mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-30 00:32:01 +08:00
Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g., on Haswell CPUs).
This commit is contained in:
parent
1e80bddde3
commit
1d23430628
@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
enum {
|
||||
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
||||
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
|
||||
k_mask = -8,
|
||||
|
||||
kr = 8,
|
||||
mr = Traits::mr,
|
||||
mr_mask = -mr,
|
||||
|
||||
nr = Traits::nr,
|
||||
nr_mask = -nr
|
||||
};
|
||||
// Increasing k gives us more time to prefetch the content of the "C"
|
||||
// registers. However once the latency is hidden there is no point in
|
||||
@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// experimentally).
|
||||
const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
|
||||
if (k_cache < k) {
|
||||
k = k_cache & k_mask;
|
||||
k = k_cache - (k_cache % kr);
|
||||
eigen_internal_assert(k > 0);
|
||||
}
|
||||
|
||||
@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
if (n_cache <= n_per_thread) {
|
||||
// Don't exceed the capacity of the l2 cache.
|
||||
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
|
||||
n = n_cache & nr_mask;
|
||||
n = n_cache - (n_cache % nr);
|
||||
eigen_internal_assert(n > 0);
|
||||
} else {
|
||||
n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
|
||||
n = (std::min<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
|
||||
}
|
||||
|
||||
if (l3 > l2) {
|
||||
@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
|
||||
const Index m_per_thread = numext::div_ceil(m, num_threads);
|
||||
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
|
||||
m = m_cache & mr_mask;
|
||||
m = m_cache - (m_cache % mr);
|
||||
eigen_internal_assert(m > 0);
|
||||
} else {
|
||||
m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
|
||||
m = (std::min<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user