Add a simple cost model to prevent Eigen's parallel GEMM from using too many threads when the inner dimension is small.

Timing for square matrices is unchanged, but both CPU and Wall time are significantly improved for skinny matrices. The benchmarks below are for multiplying NxK * KxN matrices with test names of the form BM_OuterishProd/N/K.

Improvements in Wall time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1                    3088      1610    +47.9%
BM_OuterishProd/64/4                    3562      2414    +32.2%
BM_OuterishProd/64/32                   8861      7815    +11.8%
BM_OuterishProd/128/1                  11363      6504    +42.8%
BM_OuterishProd/128/4                  11128      9794    +12.0%
BM_OuterishProd/128/64                 27691     27396     +1.1%
BM_OuterishProd/256/1                  33214     28123    +15.3%
BM_OuterishProd/256/4                  34312     36818     -7.3%
BM_OuterishProd/256/128               174866    176398     -0.9%
BM_OuterishProd/512/1                7963684    104224    +98.7%
BM_OuterishProd/512/4                7987913    112867    +98.6%
BM_OuterishProd/512/256              8198378   1306500    +84.1%
BM_OuterishProd/1k/1                 7356256    324432    +95.6%
BM_OuterishProd/1k/4                 8129616    331621    +95.9%
BM_OuterishProd/1k/512              27265418   7517538    +72.4%

Improvements in CPU time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1                    6169      1608    +73.9%
BM_OuterishProd/64/4                    7117      2412    +66.1%
BM_OuterishProd/64/32                  17702     15616    +11.8%
BM_OuterishProd/128/1                  45415      6498    +85.7%
BM_OuterishProd/128/4                  44459      9786    +78.0%
BM_OuterishProd/128/64                110657    109489     +1.1%
BM_OuterishProd/256/1                 265158     28101    +89.4%
BM_OuterishProd/256/4                 274234    183885    +32.9%
BM_OuterishProd/256/128              1397160   1408776     -0.8%
BM_OuterishProd/512/1               78947048    520703    +99.3%
BM_OuterishProd/512/4               86955578   1349742    +98.4%
BM_OuterishProd/512/256             74701613  15584661    +79.1%
BM_OuterishProd/1k/1                78352601   3877911    +95.1%
BM_OuterishProd/1k/4                78521643   3966221    +94.9%
BM_OuterishProd/1k/512              258104736  89480530    +65.3%
This commit is contained in:
Rasmus Munk Larsen 2016-10-06 10:33:10 -07:00
parent 80b5133789
commit 48c635e223
2 changed files with 38 additions and 31 deletions

View File

@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
}
};

View File

@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
};
template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
{
// TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types
@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole
// parallelizer mechanism has to be redisigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols);
#else
@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// FIXME this has to be fine tuned
Index size = transpose ? rows : cols;
Index pb_max_threads = std::max<Index>(1,size / 32);
// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
double kMinTaskSize = 50000; // Heuristic.
max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
// compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), pb_max_threads);