Merged in rmlarsen/eigen2 (pull request PR-232)

Improve performance of parallelized matrix multiply for rectangular matrices
2025-09-25 15:53:19 +08:00 · 2016-10-14 14:51:09 +00:00 · 2016-10-14 14:51:09 +00:00 · 050c681bdd
commit 050c681bdd
parent 737e4152c3 48c635e223
2 changed files with 38 additions and 31 deletions
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>

    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
-                              (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
+        (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
  }
 };

--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
 };

 template<bool Condition, typename Functor, typename Index>
-void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
+void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
 {
  // TODO when EIGEN_USE_BLAS is defined,
  // we should still enable OMP for other scalar types
@@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
  // the matrix product when multithreading is enabled. This is a temporary
  // fix to support row-major destination matrices. This whole
  // parallelizer mechanism has to be redisigned anyway.
+  EIGEN_UNUSED_VARIABLE(depth);
  EIGEN_UNUSED_VARIABLE(transpose);
  func(0,rows, 0,cols);
 #else
@@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
  // FIXME this has to be fine tuned
  Index size = transpose ? rows : cols;
  Index pb_max_threads = std::max<Index>(1,size / 32);
+  // compute the maximal number of threads from the total amount of work:
+  double work = static_cast<double>(rows) * static_cast<double>(cols) *
+      static_cast<double>(depth);
+  double kMinTaskSize = 50000;  // Heuristic.
+  max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
+
  // compute the number of threads we are going to use
  Index threads = std::min<Index>(nbThreads(), pb_max_threads);