mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-22 04:27:36 +08:00
Made the index type a template parameter of evaluateProductBlockingSizesHeuristic, useSpecificBlockingSizes and computeProductBlockingSizes.
Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes.
This commit is contained in:
parent
66b215b742
commit
f629fe95c8
@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
|
|||||||
*
|
*
|
||||||
* \sa setCpuCacheSizes */
|
* \sa setCpuCacheSizes */
|
||||||
|
|
||||||
template<typename LhsScalar, typename RhsScalar, int KcFactor>
|
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
||||||
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
|
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||||
{
|
{
|
||||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||||
@ -115,7 +115,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
// registers. However once the latency is hidden there is no point in
|
// registers. However once the latency is hidden there is no point in
|
||||||
// increasing the value of k, so we'll cap it at 320 (value determined
|
// increasing the value of k, so we'll cap it at 320 (value determined
|
||||||
// experimentally).
|
// experimentally).
|
||||||
const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
|
const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
|
||||||
if (k_cache < k) {
|
if (k_cache < k) {
|
||||||
k = k_cache - (k_cache % kr);
|
k = k_cache - (k_cache % kr);
|
||||||
eigen_internal_assert(k > 0);
|
eigen_internal_assert(k > 0);
|
||||||
@ -129,7 +129,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
n = n_cache - (n_cache % nr);
|
n = n_cache - (n_cache % nr);
|
||||||
eigen_internal_assert(n > 0);
|
eigen_internal_assert(n > 0);
|
||||||
} else {
|
} else {
|
||||||
n = (std::min<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
|
n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (l3 > l2) {
|
if (l3 > l2) {
|
||||||
@ -140,7 +140,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
m = m_cache - (m_cache % mr);
|
m = m_cache - (m_cache % mr);
|
||||||
eigen_internal_assert(m > 0);
|
eigen_internal_assert(m > 0);
|
||||||
} else {
|
} else {
|
||||||
m = (std::min<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
|
m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -157,7 +157,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
// Perhaps it would make more sense to consider k*n*m??
|
// Perhaps it would make more sense to consider k*n*m??
|
||||||
// Note that for very tiny problems, this function should be bypassed anyway
|
// Note that for very tiny problems, this function should be bypassed anyway
|
||||||
// because we use the coefficient-based implementation for them.
|
// because we use the coefficient-based implementation for them.
|
||||||
if((std::max)(k,(std::max)(m,n))<48)
|
if((numext::maxi)(k,(numext::maxi)(m,n))<48)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
typedef typename Traits::ResScalar ResScalar;
|
typedef typename Traits::ResScalar ResScalar;
|
||||||
@ -174,7 +174,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
// We also include a register-level block of the result (mx x nr).
|
// We also include a register-level block of the result (mx x nr).
|
||||||
// (In an ideal world only the lhs panel would stay in L1)
|
// (In an ideal world only the lhs panel would stay in L1)
|
||||||
// Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
|
// Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
|
||||||
const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
|
const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
|
||||||
const Index old_k = k;
|
const Index old_k = k;
|
||||||
if(k>max_kc)
|
if(k>max_kc)
|
||||||
{
|
{
|
||||||
@ -219,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
|
max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
|
||||||
}
|
}
|
||||||
// WARNING Below, we assume that Traits::nr is a power of two.
|
// WARNING Below, we assume that Traits::nr is a power of two.
|
||||||
Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
|
Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
|
||||||
if(n>nc)
|
if(n>nc)
|
||||||
{
|
{
|
||||||
// We are really blocking over the columns:
|
// We are really blocking over the columns:
|
||||||
@ -248,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
// we have both L2 and L3, and problem is small enough to be kept in L2
|
// we have both L2 and L3, and problem is small enough to be kept in L2
|
||||||
// Let's choose m such that lhs's block fits in 1/3 of L2
|
// Let's choose m such that lhs's block fits in 1/3 of L2
|
||||||
actual_lm = l2;
|
actual_lm = l2;
|
||||||
max_mc = (std::min<Index>)(576,max_mc);
|
max_mc = (numext::mini<Index>)(576,max_mc);
|
||||||
}
|
}
|
||||||
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
||||||
if (mc > Traits::mr) mc -= mc % Traits::mr;
|
if (mc > Traits::mr) mc -= mc % Traits::mr;
|
||||||
else if (mc==0) return;
|
else if (mc==0) return;
|
||||||
m = (m%mc)==0 ? mc
|
m = (m%mc)==0 ? mc
|
||||||
@ -259,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Index>
|
||||||
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
||||||
{
|
{
|
||||||
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
|
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
|
||||||
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
|
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
|
||||||
k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
|
k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
|
||||||
m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
|
m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
|
||||||
n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
|
n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -292,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
|||||||
*
|
*
|
||||||
* \sa setCpuCacheSizes */
|
* \sa setCpuCacheSizes */
|
||||||
|
|
||||||
template<typename LhsScalar, typename RhsScalar, int KcFactor>
|
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
||||||
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||||
{
|
{
|
||||||
if (!useSpecificBlockingSizes(k, m, n)) {
|
if (!useSpecificBlockingSizes(k, m, n)) {
|
||||||
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
|
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||||
@ -310,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
|
|||||||
if (n > nr) n -= n % nr;
|
if (n > nr) n -= n % nr;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename LhsScalar, typename RhsScalar>
|
template<typename LhsScalar, typename RhsScalar, typename Index>
|
||||||
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||||
{
|
{
|
||||||
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
|
computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||||
|
Loading…
x
Reference in New Issue
Block a user