diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 295dd8f25..39b283a3f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -373,8 +373,8 @@ class ei_gemm_blocking_spacem_blockB==0) - this->m_blockB = ei_aligned_new(m_sizeB); + if(this->m_blockW==0) + this->m_blockW = ei_aligned_new(m_sizeW); } void allocateAll() @@ -432,7 +432,7 @@ class GeneralProduct BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); - ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols()); + ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } }; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index e27c8edab..f307812bf 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -81,7 +81,7 @@ template struct GemmParallelInfo }; template -void ei_parallelize_gemm(const Functor& func, Index rows, Index cols) +void ei_parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose) { #ifndef EIGEN_HAS_OPENMP func(0,rows, 0,cols); @@ -98,9 +98,11 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols) if((!Condition) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); + Index size = transpose ? cols : rows; + // 2- compute the maximal number of threads from the size of the product: // FIXME this has to be fine tuned - Index max_threads = std::max(1,rows / 32); + Index max_threads = std::max(1,size / 32); // 3 - compute the number of threads we are going to use Index threads = std::min(nbThreads(), max_threads); @@ -110,6 +112,9 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols) func.initParallelSession(); + if(transpose) + std::swap(rows,cols); + Index blockCols = (cols / threads) & ~Index(0x3); Index blockRows = (rows / threads) & ~Index(0x7); @@ -127,7 +132,10 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols) info[i].rhs_start = c0; info[i].rhs_length = actualBlockCols; - func(r0, actualBlockRows, 0,cols, info); + if(transpose) + func(0, cols, r0, actualBlockRows, info); + else + func(r0, actualBlockRows, 0,cols, info); } delete[] info;