Quick temporary fix for a perf issue we just identified with vectorization.

Now the sum benchmark runs 3x faster with vectorization than without.
2025-10-20 03:51:06 +08:00 · 2008-06-23 11:23:05 +00:00 · 2008-06-23 11:23:05 +00:00 · 03d19f3bae
commit 03d19f3bae
parent 32596c5e9e
1 changed files with 20 additions and 5 deletions
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -194,9 +194,22 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
    // do the vectorizable part of the sum
    if(size >= packetSize)
    {
+    asm("#begin");
+
      PacketScalar packet_res;
      packet_res = mat.template packet<Aligned>(0, 0);
      int index;
+      if(Derived::IsVectorAtCompileTime)
+      {
+        for(index = packetSize; index<alignedSize ; index+=packetSize)
+        {
+          const int row = Derived::RowsAtCompileTime==1 ? 0 : index;
+          const int col = Derived::RowsAtCompileTime==1 ? index : 0;
+          packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
+        }
+      }
+      else
+      {
        for(index = packetSize; index<alignedSize ; index+=packetSize)
        {
          // FIXME the following is not really efficient
@@ -204,7 +217,9 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
          const int col = rowMajor ? index%innerSize : index/innerSize;
          packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
        }
+      }
      res = ei_predux(packet_res);
+    asm("#end");

      // now we must do the rest without vectorization.
      if(alignedSize == size) return res;