From a2f8eba026539f6690e1ee227db8a6a2808b29a4 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 24 Feb 2024 19:13:33 +0000 Subject: [PATCH] Speed up sparse x dense dot product. --- Eigen/src/SparseCore/SparseDot.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h index aa876ecbc..f040915b2 100644 --- a/Eigen/src/SparseCore/SparseDot.h +++ b/Eigen/src/SparseCore/SparseDot.h @@ -17,7 +17,8 @@ namespace Eigen { template template -typename internal::traits::Scalar SparseMatrixBase::dot(const MatrixBase& other) const { +inline typename internal::traits::Scalar SparseMatrixBase::dot( + const MatrixBase& other) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived, OtherDerived) @@ -30,17 +31,23 @@ typename internal::traits::Scalar SparseMatrixBase::dot(const internal::evaluator thisEval(derived()); typename internal::evaluator::InnerIterator i(thisEval, 0); - Scalar res(0); - while (i) { - res += numext::conj(i.value()) * other.coeff(i.index()); + // Two accumulators, which breaks the dependency chain on the accumulator + // and allows more instruction-level parallelism in the following loop. + Scalar res1(0); + Scalar res2(0); + for (; i; ++i) { + res1 += numext::conj(i.value()) * other.coeff(i.index()); ++i; + if (i) { + res2 += numext::conj(i.value()) * other.coeff(i.index()); + } } - return res; + return res1 + res2; } template template -typename internal::traits::Scalar SparseMatrixBase::dot( +inline typename internal::traits::Scalar SparseMatrixBase::dot( const SparseMatrixBase& other) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)