Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-08-14 20:56:00 +08:00
Made the blocking computation aware of the l3 cache.
Also optimized the blocking parameters to take into account the number of threads used for a computation.

This commit is contained in:
  parent dba55041ab
  commit bfdd9f3ac9
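For callers, the visible change is a new trailing num_threads argument on computeProductBlockingSizes (plus a num_threads/l3_blocking pair on the gemm_blocking_space constructors), so block sizes can be tuned to how many threads share the caches. A minimal sketch of the new entry point, modelled on the test_product_large hunk below; the main() driver is hypothetical and the call itself is internal Eigen API:

    #include <Eigen/Core>
    #include <cstdio>

    int main()
    {
      // Requested GEMM dimensions; the call shrinks them in place to blocking sizes.
      std::ptrdiff_t k = 4096, m = 4096, n = 4096;
      int num_threads = 4; // threads that will share the L2/L3 caches
      Eigen::internal::computeProductBlockingSizes<float,float>(k, m, n, num_threads);
      std::printf("kc=%td mc=%td nc=%td\n", k, m, n);
      return 0;
    }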
@@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
     typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
 
-    BlockingType blocking(rhs.rows(), rhs.cols(), size);
+    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
 
     triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
                             (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
@@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
 }
 
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
+  static bool m_cache_sizes_initialized = false;
+  static std::ptrdiff_t m_l1CacheSize = 32*1024;
+  static std::ptrdiff_t m_l2CacheSize = 256*1024;
+  static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+
+  if(!m_cache_sizes_initialized)
   {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
+    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
+    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+    m_cache_sizes_initialized = true;
   }
 
   if(action==SetAction)
   {
     // set the cpu cache size and cache all block sizes from a global cache size in byte
     eigen_internal_assert(l1!=0 && l2!=0);
     m_l1CacheSize = *l1;
     m_l2CacheSize = *l2;
+    m_l3CacheSize = *l3;
   }
   else if(action==GetAction)
   {
     eigen_internal_assert(l1!=0 && l2!=0);
     *l1 = m_l1CacheSize;
     *l2 = m_l2CacheSize;
+    *l3 = m_l3CacheSize;
   }
   else
   {
@@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */
+#define CEIL(a, b) ((a)+(b)-1)/(b)
+
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  EIGEN_UNUSED_VARIABLE(n);
   // Explanations:
   // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
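CEIL(a, b) is integer ceiling division; it is used below to split the n and m dimensions evenly across threads. A standalone check of the arithmetic (illustrative only, re-stating the macro outside Eigen):

    #include <cassert>
    #define CEIL(a, b) ((a)+(b)-1)/(b)

    int main()
    {
      assert(CEIL(10, 4) == 3); // 10 columns over 4 threads: at most 3 each
      assert(CEIL(12, 4) == 3); // exact multiples are unchanged
      assert(CEIL(13, 4) == 4);
      return 0;
    }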
@@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
   // at the register level. For vectorization purpose, these small vertical panels are unpacked,
   // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
   // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
-  };
-
-  manage_caching_sizes(GetAction, &l1, &l2);
-
-//   k = std::min<SizeType>(k, l1/kdiv);
-//   SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-//   if(_m<m) m = _m & mr_mask;
-
-  // In unit tests we do not want to use extra large matrices,
-  // so we reduce the block size to check the blocking strategy is not flawed
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      k_mask = (0xffffffff/8)*8,
+
+      mr = Traits::mr,
+      mr_mask = (0xffffffff/mr)*mr,
+
+      nr = Traits::nr,
+      nr_mask = (0xffffffff/nr)*nr
+    };
+    SizeType k_cache = (l1-ksub)/kdiv;
+    if (k_cache < k) {
+      k = k_cache & k_mask;
+      eigen_assert(k > 0);
+    }
+
+    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    SizeType n_per_thread = CEIL(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_assert(n_cache >= static_cast<SizeType>(nr));
+      n = n_cache & nr_mask;
+      eigen_assert(n > 0);
+    } else {
+      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      SizeType m_per_thread = CEIL(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+        m = m_cache & mr_mask;
+        eigen_assert(m > 0);
+      } else {
+        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the block size to check the blocking strategy is not flawed
 #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-//   k = std::min<SizeType>(k,240);
-//   n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-//   m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
-
   k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
   n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
   m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
 #else
   k = std::min<SizeType>(k,24);
   n = std::min<SizeType>(n,384/sizeof(RhsScalar));
   m = std::min<SizeType>(m,384/sizeof(RhsScalar));
 #endif
+  }
 }
 
 template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
 
 #ifdef EIGEN_HAS_FUSE_CJMADD
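To see what the num_threads > 1 branch computes, here is a worked trace using the fallback cache sizes from manage_caching_sizes above (l1 = 32 KB, l2 = 256 KB) and assumed register-block sizes mr = 8, nr = 4 for float; the real values come from gebp_traits and depend on the SIMD target, so this is a sketch of the arithmetic rather than exact Eigen output:

    #include <cstdio>

    int main()
    {
      const long l1 = 32*1024, l2 = 256*1024;     // bytes (fallback defaults above)
      const long mr = 8, nr = 4;                  // assumed register blocking
      const long num_threads = 4, sz = sizeof(float);
      long k = 4096, n = 4096;

      // kc: per unit of k, one mr-wide LHS strip and one nr-wide RHS strip must
      // fit in L1, after reserving the mr x nr accumulator (ksub) in registers.
      const long kdiv = mr*sz + nr*sz;            //  48 bytes per unit of k
      const long ksub = mr*nr*sz;                 // 128 bytes reserved
      long k_cache = (l1 - ksub) / kdiv;          // (32768 - 128)/48 = 680
      if (k_cache < k) k = k_cache & ((0xffffffffL/8)*8);   // multiple of 8: 680

      // nc: the packed kc x nc RHS panel must fit in L2 beyond what L1 holds,
      // and each thread only needs its own CEIL(n, num_threads) share.
      long n_cache = (l2 - l1) / (nr*sz*k);       // 229376/10880 = 21
      long n_per_thread = (n + num_threads - 1) / num_threads;   // 1024
      if (n_cache <= n_per_thread)
        n = n_cache & ((0xffffffffL/nr)*nr);      // 21 rounded down to 20
      std::printf("kc=%ld nc=%ld\n", k, n);       // kc=680 nc=20
      return 0;
    }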
@@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Co
 * \sa setCpuCacheSize */
 inline std::ptrdiff_t l1CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l1;
 }
 
@@ -1855,8 +1893,8 @@ inline std::ptrdiff_t l1CacheSize()
 * \sa setCpuCacheSize */
 inline std::ptrdiff_t l2CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l2;
 }
 
@@ -1865,9 +1903,9 @@ inline std::ptrdiff_t l2CacheSize()
 * for the algorithms working per blocks.
 *
 * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
 {
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
 } // end namespace Eigen
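Overriding the detected sizes now takes all three levels. A usage sketch mirroring the test_product_large hunk further down (note this commit adds no public l3CacheSize() getter, so only l1 and l2 are read back):

    #include <Eigen/Core>
    #include <cassert>

    int main()
    {
      Eigen::setCpuCacheSizes(32*1024, 256*1024, 4*1024*1024); // l1, l2, l3 in bytes
      assert(Eigen::l1CacheSize() == 32*1024);
      assert(Eigen::l2CacheSize() == 256*1024);
      return 0;
    }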
@@ -299,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
+    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
@@ -331,21 +331,21 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
+    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
       this->m_kc = depth;
 
-      if(full_rows)
+      if(l3_blocking)
+      {
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+      }
+      else  // no l3 blocking
       {
         DenseIndex m = this->m_mc;
-        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
-      }
-      else // full columns
-      {
-        DenseIndex n = this->m_nc;
-        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
+        DenseIndex n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
       }
 
       m_sizeA = this->m_mc * this->m_kc;
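The dynamic gemm_blocking_space constructor now forwards num_threads to computeProductBlockingSizes, and l3_blocking selects whether m and n may shrink as well: when it is false, the computed m and n are discarded (local copies are passed), so only kc is effectively blocked. A hypothetical call-site sketch using internal API, shaped after the BLAS call sites below:

    #include <Eigen/Core>

    void pick_blocks(Eigen::DenseIndex rows, Eigen::DenseIndex cols, Eigen::DenseIndex depth)
    {
      typedef Eigen::internal::gemm_blocking_space<Eigen::ColMajor, float, float,
          Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic> BlockingType;
      // num_threads = 1, l3_blocking = true: kc, mc and nc are all blocked.
      BlockingType blocking(rows, cols, depth, 1, true);
    }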
@@ -451,7 +451,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
                 (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
               _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
 
-    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
 
     internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
   }
@@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     Index kc = depth; // cache block size along the K direction
     Index mc = size;  // cache block size along the M direction
     Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
     // !!! mc must be a multiple of nr:
     if(mc > Traits::nr)
       mc = (mc/Traits::nr)*Traits::nr;
@@ -49,8 +49,8 @@ inline void initParallel()
 {
   int nbt;
   internal::manage_multi_threading(GetAction, &nbt);
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 }
 
 /** \returns the max number of threads reserved for Eigen
@@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     Index kc = size;  // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
     // kc must smaller than mc
     kc = (std::min)(kc,mc);
 
@@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
   LhsMapper lhs(_lhs,lhsStride);
   ResMapper res(_res,resStride);
 
   Index kc = size;  // cache block size along the K direction
   Index mc = rows;  // cache block size along the M direction
   Index nc = cols;  // cache block size along the N direction
-  computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+  computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
   std::size_t sizeB = kc*cols;
   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
   ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
@@ -412,7 +412,7 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
     Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
                                          : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
 
-    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
 
     internal::product_triangular_matrix_matrix<Scalar, Index,
       Mode, LhsIsTriangular,
@@ -81,8 +81,8 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
   // the goal here is to subdivise the Rhs panels such that we keep some cache
   // coherence when accessing the rhs elements
-  std::ptrdiff_t l1, l2;
-  manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
   Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
   subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
 
@@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
   else                matrix(c, *m, *n, *ldc) *= beta;
   }
 
-  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,true);
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true);
 
   int code = OP(*opa) | (OP(*opb) << 2);
   func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0);
@@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
   if(SIDE(*side)==LEFT)
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
     func[code](*m, *n, a, *lda, b, *ldb, blocking);
   }
   else
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
     func[code](*n, *m, a, *lda, b, *ldb, blocking);
   }
 
@@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
   if(SIDE(*side)==LEFT)
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
     func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking);
   }
   else
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
     func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking);
   }
   return 1;
@@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
   else if(*n<0)                                                    info = 3;
   else if(*k<0)                                                    info = 4;
   else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                  info = 7;
-  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                  info = 9;
+  else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k))                  info = 9;
   else if(*ldc<std::max(1,*n))                                     info = 12;
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6);
@@ -39,15 +39,16 @@ void test_product_large()
   // check the functions to setup blocking sizes compile and do not segfault
   // FIXME check they do what they are supposed to do !!
   std::ptrdiff_t l1 = internal::random<int>(10000,20000);
-  std::ptrdiff_t l2 = internal::random<int>(1000000,2000000);
-  setCpuCacheSizes(l1,l2);
+  std::ptrdiff_t l2 = internal::random<int>(100000,200000);
+  std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
+  setCpuCacheSizes(l1,l2,l3);
   VERIFY(l1==l1CacheSize());
   VERIFY(l2==l2CacheSize());
   std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
   std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
   std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
   // only makes sure it compiles fine
-  internal::computeProductBlockingSizes<float,float>(k1,m1,n1);
+  internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
   }
 
   {
@ -55,7 +55,7 @@
|
|||||||
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
|
||||||
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
|
||||||
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
|
||||||
//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
|
||||||
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
|
||||||
#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
|
||||||
#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
|
#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
|
||||||
|
@@ -766,7 +766,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    BlockingType blocking(m, n, k, true);
+    BlockingType blocking(m, n, k, 1, true);
     const Index kc = blocking.kc();
     const Index mc = (std::min)(m, blocking.mc());
     const Index nc = (std::min)(n, blocking.nc());
@@ -152,7 +152,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index mc = m;
     Index nc = n;
     Index kc = k;
-    internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc/*, num_threads*/);
+    internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
     eigen_assert(mc <= m);
     eigen_assert(nc <= n);
     eigen_assert(kc <= k);
@@ -197,9 +197,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_promises = num_threads * n_blocks;
-    Promise p;
-    p.set_value();
-    std::vector<Promise> kernel_promises(num_kernel_promises, p);
+    std::vector<Promise> kernel_promises(num_kernel_promises);
+    for (int i = 0; i < kernel_promises.size(); ++i) {
+      kernel_promises[i].set_value();
+    }
 
     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
       const Index k_start = k_block_idx * kc;
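The promise initialization is also reworked: the old code copy-filled the vector from one already-satisfied promise, while the new code default-constructs the vector and satisfies each element in place. If Promise is a move-only type such as std::promise<void> (an assumption here; Promise is a typedef in this file), the copy-fill form would not even compile, which matters now that the TensorContractionThreadPool.h include is re-enabled above. Standalone illustration of the new pattern:

    #include <future>
    #include <vector>

    int main()
    {
      // std::promise is move-only, so std::vector<promise>(n, p) (copy-fill)
      // is ill-formed; default-construct, then mark each promise as satisfied.
      std::vector<std::promise<void> > promises(4);
      for (std::size_t i = 0; i < promises.size(); ++i)
        promises[i].set_value();
      return 0;
    }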
@@ -275,8 +276,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
               need_to_pack, // need_to_pack
             };
 
-            typedef decltype(Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>) Func;
-            this->m_device.enqueueNoFuture<Func, packRKArg>(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
+            this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
           }
         }
       }
@@ -338,7 +338,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                                    actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
 
             const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
-            eigen_assert(!(*arg.kernel_promises)[set_idx].ready());
             (*arg.kernel_promises)[set_idx].set_value();
           }
         }