diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index ce5494182..dc679b3fe 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -79,18 +79,15 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes */
-#define CEIL(a, b) ((a)+(b)-1)/(b)
 
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
   // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
 
@@ -108,32 +105,32 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_
       nr = Traits::nr,
       nr_mask = (0xffffffff/nr)*nr
     };
-    SizeType k_cache = (l1-ksub)/kdiv;
+    Index k_cache = (l1-ksub)/kdiv;
     if (k_cache < k) {
       k = k_cache & k_mask;
-      eigen_assert(k > 0);
+      eigen_internal_assert(k > 0);
     }
 
-    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    SizeType n_per_thread = CEIL(n, num_threads);
+    Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
-      eigen_assert(n_cache >= static_cast<SizeType>(nr));
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
       n = n_cache & nr_mask;
-      eigen_assert(n > 0);
+      eigen_internal_assert(n > 0);
     } else {
-      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+      n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
     }
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      SizeType m_per_thread = CEIL(m, num_threads);
-      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+      Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache & mr_mask;
-        eigen_assert(m > 0);
+        eigen_internal_assert(m > 0);
       } else {
-        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+        m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
       }
     }
   }
@@ -141,19 +138,19 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_
     // In unit tests we do not want to use extra large matrices,
     // so we reduce the block size to check the blocking strategy is not flawed
 #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-    n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-    m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
+    k = std::min<Index>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
+    n = std::min<Index>(n,3840/sizeof(RhsScalar));
+    m = std::min<Index>(m,3840/sizeof(RhsScalar));
 #else
-    k = std::min<SizeType>(k,24);
-    n = std::min<SizeType>(n,384/sizeof(RhsScalar));
-    m = std::min<SizeType>(m,384/sizeof(RhsScalar));
+    k = std::min<Index>(k,24);
+    n = std::min<Index>(n,384/sizeof(RhsScalar));
+    m = std::min<Index>(m,384/sizeof(RhsScalar));
 #endif
   }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
+template<typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
   computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index f3bafd5af..3be9e6ca5 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -284,6 +284,14 @@ template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b =
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif
 
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+T div_ceil(const T &a, const T &b)
+{
+  return (a+b-1) / b;
+}
+
 } // end namespace numext
 
 } // end namespace Eigen
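
For reference, a minimal standalone sketch (plain C++, outside Eigen; the main() harness and the sample sizes are illustrative assumptions, not part of the patch) of the div_ceil semantics this patch introduces, and of how the blocking heuristic uses it to compute a per-thread share:

#include <cassert>
#include <cstddef>

// Same formula as the numext::div_ceil added to Meta.h above:
// integer division rounded up, assuming a >= 0 and b > 0.
template<typename T>
T div_ceil(const T &a, const T &b)
{
  return (a+b-1) / b;
}

int main()
{
  // Basic semantics: ceil(a/b) without going through floating point.
  assert(div_ceil(9, 3) == 3);   // exact division is unchanged
  assert(div_ceil(10, 3) == 4);  // any remainder rounds up
  assert(div_ceil(0, 3) == 0);   // a == 0 satisfies the a >= 0 precondition

  // Usage mirroring the heuristic's "n_per_thread = numext::div_ceil(n, num_threads)":
  // each thread gets at least its fair share, so together they cover all n columns.
  std::ptrdiff_t n = 1000, num_threads = 3;
  std::ptrdiff_t n_per_thread = div_ceil(n, num_threads);  // 334, not 333
  assert(n_per_thread * num_threads >= n);
  return 0;
}

Unlike the removed CEIL macro, the function form evaluates its b argument once and avoids the precedence pitfall of the unparenthesized macro body ((a)+(b)-1)/(b).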