Small speed-up in row-major sparse dense product

2025-06-04 18:54:00 +08:00 · 2021-12-15 18:46:25 +00:00 · 2021-12-15 18:46:25 +00:00 · e939c06b0e
commit e939c06b0e
parent 2d39da8af5
1 changed files with 12 additions and 4 deletions
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@ -65,10 +65,18 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
  
  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
  {
-    typename Res::Scalar tmp(0);
-    for(LhsInnerIterator it(lhsEval,i); it ;++it)
-      tmp += it.value() * rhs.coeff(it.index(),col);
-    res.coeffRef(i,col) += alpha * tmp;
+    // Two accumulators, which breaks the dependency chain on the accumulator
+    // and allows more instruction-level parallelism in the following loop
+    typename Res::Scalar tmp_a(0);
+    typename Res::Scalar tmp_b(0);
+    for(LhsInnerIterator it(lhsEval,i); it ;++it) {
+      tmp_a += it.value() * rhs.coeff(it.index(), col);
+      ++it;
+      if(it) {
+        tmp_b += it.value() * rhs.coeff(it.index(), col);
+      }
+    }
+    res.coeffRef(i, col) += alpha * (tmp_a + tmp_b);
  }
  
 };