Add a simple cost model to prevent Eigen's parallel GEMM from using too many threads when the inner dimension is small.

Timing for square matrices is unchanged, but both CPU and Wall time are significantly improved for skinny matrices. The benchmarks below are for multiplying NxK * KxN matrices with test names of the form BM_OuterishProd/N/K.

Improvements in Wall time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1                    3088      1610    +47.9%
BM_OuterishProd/64/4                    3562      2414    +32.2%
BM_OuterishProd/64/32                   8861      7815    +11.8%
BM_OuterishProd/128/1                  11363      6504    +42.8%
BM_OuterishProd/128/4                  11128      9794    +12.0%
BM_OuterishProd/128/64                 27691     27396     +1.1%
BM_OuterishProd/256/1                  33214     28123    +15.3%
BM_OuterishProd/256/4                  34312     36818     -7.3%
BM_OuterishProd/256/128               174866    176398     -0.9%
BM_OuterishProd/512/1                7963684    104224    +98.7%
BM_OuterishProd/512/4                7987913    112867    +98.6%
BM_OuterishProd/512/256              8198378   1306500    +84.1%
BM_OuterishProd/1k/1                 7356256    324432    +95.6%
BM_OuterishProd/1k/4                 8129616    331621    +95.9%
BM_OuterishProd/1k/512              27265418   7517538    +72.4%

Improvements in CPU time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1                    6169      1608    +73.9%
BM_OuterishProd/64/4                    7117      2412    +66.1%
BM_OuterishProd/64/32                  17702     15616    +11.8%
BM_OuterishProd/128/1                  45415      6498    +85.7%
BM_OuterishProd/128/4                  44459      9786    +78.0%
BM_OuterishProd/128/64                110657    109489     +1.1%
BM_OuterishProd/256/1                 265158     28101    +89.4%
BM_OuterishProd/256/4                 274234    183885    +32.9%
BM_OuterishProd/256/128              1397160   1408776     -0.8%
BM_OuterishProd/512/1               78947048    520703    +99.3%
BM_OuterishProd/512/4               86955578   1349742    +98.4%
BM_OuterishProd/512/256             74701613  15584661    +79.1%
BM_OuterishProd/1k/1                78352601   3877911    +95.1%
BM_OuterishProd/1k/4                78521643   3966221    +94.9%
BM_OuterishProd/1k/512              258104736  89480530    +65.3%
This commit is contained in:
Rasmus Munk Larsen 2016-10-06 10:33:10 -07:00
parent 80b5133789
commit 48c635e223
2 changed files with 38 additions and 31 deletions

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H #define EIGEN_GENERAL_MATRIX_MATRIX_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@ -24,7 +24,7 @@ template<
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{ {
typedef gebp_traits<RhsScalar,LhsScalar> Traits; typedef gebp_traits<RhsScalar,LhsScalar> Traits;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run( static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth, Index rows, Index cols, Index depth,
@ -54,7 +54,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
{ {
typedef gebp_traits<LhsScalar,RhsScalar> Traits; typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth, static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride, const LhsScalar* _lhs, Index lhsStride,
@ -85,13 +85,13 @@ static void run(Index rows, Index cols, Index depth,
// this is the parallel version! // this is the parallel version!
Index tid = omp_get_thread_num(); Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads(); Index threads = omp_get_num_threads();
LhsScalar* blockA = blocking.blockA(); LhsScalar* blockA = blocking.blockA();
eigen_internal_assert(blockA!=0); eigen_internal_assert(blockA!=0);
std::size_t sizeB = kc*nc; std::size_t sizeB = kc*nc;
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for(Index k=0; k<depth; k+=kc) for(Index k=0; k<depth; k+=kc)
{ {
@ -114,7 +114,7 @@ static void run(Index rows, Index cols, Index depth,
// Notify the other threads that the part A'_i is ready to go. // Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k; info[tid].sync = k;
// Computes C_i += A' * B' per A'_i // Computes C_i += A' * B' per A'_i
for(Index shift=0; shift<threads; ++shift) for(Index shift=0; shift<threads; ++shift)
{ {
@ -161,7 +161,7 @@ static void run(Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols; const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
// For each horizontal panel of the rhs, and corresponding panel of the lhs... // For each horizontal panel of the rhs, and corresponding panel of the lhs...
@ -172,24 +172,24 @@ static void run(Index rows, Index cols, Index depth,
for(Index k2=0; k2<depth; k2+=kc) for(Index k2=0; k2<depth; k2+=kc)
{ {
const Index actual_kc = (std::min)(k2+kc,depth)-k2; const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's // Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number. // horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
// For each kc x nc block of the rhs's horizontal panel... // For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc) for(Index j2=0; j2<cols; j2+=nc)
{ {
const Index actual_nc = (std::min)(j2+nc,cols)-j2; const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching) // We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of // Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times). // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
if((!pack_rhs_once) || i2==0) if((!pack_rhs_once) || i2==0)
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel: // Everything is packed, we can now call the panel * block kernel:
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
} }
@ -229,7 +229,7 @@ struct gemm_functor
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha, m_blocking, info); m_actualAlpha, m_blocking, info);
} }
typedef typename Gemm::Traits Traits; typedef typename Gemm::Traits Traits;
protected: protected:
@ -313,7 +313,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
#endif #endif
} }
void initParallel(Index, Index, Index, Index) void initParallel(Index, Index, Index, Index)
{} {}
@ -359,14 +359,14 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
m_sizeA = this->m_mc * this->m_kc; m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc; m_sizeB = this->m_kc * this->m_nc;
} }
void initParallel(Index rows, Index cols, Index depth, Index num_threads) void initParallel(Index rows, Index cols, Index depth, Index num_threads)
{ {
this->m_mc = Transpose ? cols : rows; this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols; this->m_nc = Transpose ? rows : cols;
this->m_kc = depth; this->m_kc = depth;
eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0); eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
Index m = this->m_mc; Index m = this->m_mc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads); computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
m_sizeA = this->m_mc * this->m_kc; m_sizeA = this->m_mc * this->m_kc;
@ -401,7 +401,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
} // end namespace internal } // end namespace internal
namespace internal { namespace internal {
template<typename Lhs, typename Rhs> template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> > : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
@ -409,21 +409,21 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
typedef typename Product<Lhs,Rhs>::Scalar Scalar; typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef typename Lhs::Scalar LhsScalar; typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar; typedef typename Rhs::Scalar RhsScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits; typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits; typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
enum { enum {
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
}; };
typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct; typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
template<typename Dst> template<typename Dst>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
@ -453,7 +453,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
else else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
} }
template<typename Dest> template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
{ {
@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit); (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
} }
}; };

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_PARALLELIZER_H #ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
}; };
template<bool Condition, typename Functor, typename Index> template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose) void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
{ {
// TODO when EIGEN_USE_BLAS is defined, // TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types // we should still enable OMP for other scalar types
@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// the matrix product when multithreading is enabled. This is a temporary // the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole // fix to support row-major destination matrices. This whole
// parallelizer mechanism has to be redisigned anyway. // parallelizer mechanism has to be redisigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose); EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols); func(0,rows, 0,cols);
#else #else
@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// FIXME this has to be fine tuned // FIXME this has to be fine tuned
Index size = transpose ? rows : cols; Index size = transpose ? rows : cols;
Index pb_max_threads = std::max<Index>(1,size / 32); Index pb_max_threads = std::max<Index>(1,size / 32);
// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
double kMinTaskSize = 50000; // Heuristic.
max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
// compute the number of threads we are going to use // compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), pb_max_threads); Index threads = std::min<Index>(nbThreads(), pb_max_threads);
@ -120,19 +127,19 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
if(transpose) if(transpose)
std::swap(rows,cols); std::swap(rows,cols);
ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0); ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
#pragma omp parallel num_threads(threads) #pragma omp parallel num_threads(threads)
{ {
Index i = omp_get_thread_num(); Index i = omp_get_thread_num();
// Note that the actual number of threads might be lower than the number of request ones. // Note that the actual number of threads might be lower than the number of request ones.
Index actual_threads = omp_get_num_threads(); Index actual_threads = omp_get_num_threads();
Index blockCols = (cols / actual_threads) & ~Index(0x3); Index blockCols = (cols / actual_threads) & ~Index(0x3);
Index blockRows = (rows / actual_threads); Index blockRows = (rows / actual_threads);
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
Index r0 = i*blockRows; Index r0 = i*blockRows;
Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;