From 5401fbcc50747583b0d47e195f23f988f7dfac5e Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 9 Apr 2015 16:44:10 -0700
Subject: [PATCH] Improved the blocking strategy to speedup multithreaded
 tensor contractions.

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 24623963b..320f96a39 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       nr = Traits::nr,
       nr_mask = (0xffffffff/nr)*nr
     };
-    Index k_cache = (l1-ksub)/kdiv;
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
     if (k_cache < k) {
       k = k_cache & k_mask;
       eigen_internal_assert(k > 0);
     }
 
-    Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    Index n_per_thread = numext::div_ceil(n, num_threads);
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
       eigen_internal_assert(n_cache >= static_cast<Index>(nr));
@@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      Index m_per_thread = numext::div_ceil(m, num_threads);
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
       if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache & mr_mask;
         eigen_internal_assert(m > 0);
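
Note: for reference, here is a small standalone sketch of the heuristic these hunks change: cap the k blocking depth at 320, keep each thread's share of the rhs panel inside its slice of the L2 cache, and split the L3 cache evenly between threads for the lhs panel. The cache sizes, block-size constants, scalar sizes and the div_ceil helper below are made-up placeholders, not Eigen's actual values (the real code obtains l1/l2/l3 from the CPU and uses numext::div_ceil and the Traits constants); only the shape of the computation follows the patch.

// Standalone sketch, not part of the patch; all constants are assumptions.
#include <algorithm>
#include <cstddef>
#include <cstdio>

typedef std::ptrdiff_t Index;

// Stand-in for numext::div_ceil.
static Index div_ceil(Index a, Index b) { return (a + b - 1) / b; }

void blockingSketch(Index& k, Index& m, Index& n, Index num_threads)
{
  // Hypothetical cache sizes (bytes) and register-block sizes.
  const Index l1 = 32*1024, l2 = 256*1024, l3 = 8*1024*1024;
  const Index mr = 8, nr = 4;                       // stand-ins for Traits::mr / Traits::nr
  const Index kdiv = 2*(mr*sizeof(float) + nr*sizeof(float));
  const Index ksub = mr*nr*sizeof(float);
  const Index k_mask  = (0xffffffff/8)*8;           // round k down to a multiple of 8
  const Index mr_mask = (0xffffffff/mr)*mr;         // round m down to a multiple of mr
  const Index nr_mask = (0xffffffff/nr)*nr;         // round n down to a multiple of nr

  // Cap the k blocking depth at 320: past that point the prefetch latency is
  // already hidden, so a larger k no longer helps.
  const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
  if (k_cache < k) k = k_cache & k_mask;

  // Each thread handles roughly n/num_threads columns; shrink n only if a
  // thread's share of the rhs panel would overflow its slice of L2.
  const Index n_cache = (l2-l1) / (nr * sizeof(float) * k);
  const Index n_per_thread = div_ceil(n, num_threads);
  if (n_cache <= n_per_thread) n = n_cache & nr_mask;

  // L3 is shared between all cores, so budget an equal chunk of it per
  // thread for the lhs panel and shrink m accordingly.
  if (l3 > l2) {
    const Index m_cache = (l3-l2) / (sizeof(float) * k * num_threads);
    const Index m_per_thread = div_ceil(m, num_threads);
    if (m_cache < m_per_thread && m_cache >= mr) m = m_cache & mr_mask;
  }
}

int main()
{
  Index k = 1024, m = 4096, n = 4096;
  blockingSketch(k, m, n, /*num_threads=*/8);
  std::printf("k=%td m=%td n=%td\n", k, m, n);
  return 0;
}

With the placeholder sizes above, k drops from 1024 to the 320 cap and n shrinks to fit a per-thread slice of L2, while m stays untouched because each thread's share of the lhs panel already fits in its chunk of L3.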