From f629fe95c8973706a9deacd586d1b16d6ea0c5c0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Apr 2016 13:11:19 -0700 Subject: [PATCH] Made the index type a template parameter to evaluateProductBlockingSizes Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes. --- .../Core/products/GeneralBlockPanelKernel.h | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bd559dc6a..5b0473598 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff * * \sa setCpuCacheSizes */ -template +template void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { typedef gebp_traits Traits; @@ -115,7 +115,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (std::min)((l1-ksub)/kdiv, 320); + const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -129,7 +129,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n n = n_cache - (n_cache % nr); eigen_internal_assert(n > 0); } else { - n = (std::min)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); + n = (numext::mini)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); } if (l3 > l2) { @@ -140,7 +140,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n m = m_cache - (m_cache % mr); eigen_internal_assert(m > 0); } else { - m = (std::min)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); + m = (numext::mini)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); } } } @@ -157,7 +157,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. - if((std::max)(k,(std::max)(m,n))<48) + if((numext::maxi)(k,(numext::maxi)(m,n))<48) return; typedef typename Traits::ResScalar ResScalar; @@ -174,7 +174,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // We also include a register-level block of the result (mx x nr). // (In an ideal world only the lhs panel would stay in L1) // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: - const Index max_kc = std::max(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); + const Index max_kc = numext::maxi(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); const Index old_k = k; if(k>max_kc) { @@ -219,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); } // WARNING Below, we assume that Traits::nr is a power of two. - Index nc = std::min(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + Index nc = numext::mini(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) { // We are really blocking over the columns: @@ -248,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = (std::min)(576,max_mc); + max_mc = (numext::mini)(576,max_mc); } - Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + Index mc = (numext::mini)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; else if (mc==0) return; m = (m%mc)==0 ? mc @@ -259,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n } } +template inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) { #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + k = numext::mini(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); return true; } #else @@ -292,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) * * \sa setCpuCacheSizes */ -template +template void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { if (!useSpecificBlockingSizes(k, m, n)) { - evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); + evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); } typedef gebp_traits Traits; @@ -310,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads if (n > nr) n -= n % nr; } -template +template inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { - computeProductBlockingSizes(k, m, n, num_threads); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD