Merged in rmlarsen/eigen2 (pull request PR-232)

Improve performance of parallelized matrix multiply for rectangular matrices
This commit is contained in:
Gael Guennebaud 2016-10-14 14:51:09 +00:00
commit 050c681bdd
2 changed files with 38 additions and 31 deletions

View File

@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
}
};

View File

@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
};
template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
{
// TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types
@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole
// parallelizer mechanism has to be redisigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols);
#else
@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// FIXME this has to be fine tuned
Index size = transpose ? rows : cols;
Index pb_max_threads = std::max<Index>(1,size / 32);
// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
double kMinTaskSize = 50000; // Heuristic.
max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
// compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), pb_max_threads);