mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-11 19:29:02 +08:00
Merged in rmlarsen/eigen2 (pull request PR-232)
Improve performance of parallelized matrix multiply for rectangular matrices
This commit is contained in:
commit
050c681bdd
@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
|
||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
||||
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
|
||||
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
|
||||
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
|
||||
};
|
||||
|
||||
template<bool Condition, typename Functor, typename Index>
|
||||
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
|
||||
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
|
||||
{
|
||||
// TODO when EIGEN_USE_BLAS is defined,
|
||||
// we should still enable OMP for other scalar types
|
||||
@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
||||
// the matrix product when multithreading is enabled. This is a temporary
|
||||
// fix to support row-major destination matrices. This whole
|
||||
// parallelizer mechanism has to be redesigned anyway.
|
||||
EIGEN_UNUSED_VARIABLE(depth);
|
||||
EIGEN_UNUSED_VARIABLE(transpose);
|
||||
func(0,rows, 0,cols);
|
||||
#else
|
||||
@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
||||
// FIXME this has to be fine tuned
|
||||
Index size = transpose ? rows : cols;
|
||||
Index pb_max_threads = std::max<Index>(1,size / 32);
|
||||
// compute the maximal number of threads from the total amount of work:
|
||||
double work = static_cast<double>(rows) * static_cast<double>(cols) *
|
||||
static_cast<double>(depth);
|
||||
double kMinTaskSize = 50000; // Heuristic.
|
||||
max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
|
||||
|
||||
// compute the number of threads we are going to use
|
||||
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user