mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 11:49:02 +08:00
Merged in rmlarsen/eigen2 (pull request PR-232)
Improve performance of parallelized matrix multiply for rectangular matrices
This commit is contained in:
commit
050c681bdd
@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
|
|
||||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
||||||
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
|
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
|
||||||
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
|
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<bool Condition, typename Functor, typename Index>
|
template<bool Condition, typename Functor, typename Index>
|
||||||
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
|
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
|
||||||
{
|
{
|
||||||
// TODO when EIGEN_USE_BLAS is defined,
|
// TODO when EIGEN_USE_BLAS is defined,
|
||||||
// we should still enable OMP for other scalar types
|
// we should still enable OMP for other scalar types
|
||||||
@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
|||||||
// the matrix product when multithreading is enabled. This is a temporary
|
// the matrix product when multithreading is enabled. This is a temporary
|
||||||
// fix to support row-major destination matrices. This whole
|
// fix to support row-major destination matrices. This whole
|
||||||
// parallelizer mechanism has to be redesigned anyway.
|
// parallelizer mechanism has to be redesigned anyway.
|
||||||
|
EIGEN_UNUSED_VARIABLE(depth);
|
||||||
EIGEN_UNUSED_VARIABLE(transpose);
|
EIGEN_UNUSED_VARIABLE(transpose);
|
||||||
func(0,rows, 0,cols);
|
func(0,rows, 0,cols);
|
||||||
#else
|
#else
|
||||||
@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
|||||||
// FIXME this has to be fine tuned
|
// FIXME this has to be fine tuned
|
||||||
Index size = transpose ? rows : cols;
|
Index size = transpose ? rows : cols;
|
||||||
Index pb_max_threads = std::max<Index>(1,size / 32);
|
Index pb_max_threads = std::max<Index>(1,size / 32);
|
||||||
|
// compute the maximal number of threads from the total amount of work:
|
||||||
|
double work = static_cast<double>(rows) * static_cast<double>(cols) *
|
||||||
|
static_cast<double>(depth);
|
||||||
|
double kMinTaskSize = 50000; // Heuristic.
|
||||||
|
max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
|
||||||
|
|
||||||
// compute the number of threads we are going to use
|
// compute the number of threads we are going to use
|
||||||
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
|
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user