Strongly hint but don't force the compiler to unroll a some loops in the tensor executor. This results in up to 27% faster code.

2025-07-25 14:24:31 +08:00 · 2016-05-05 09:25:55 -07:00 · 2016-05-05 09:25:55 -07:00 · a4d6e8fef0
commit a4d6e8fef0
parent 7875437ca0
1 changed files with 12 additions and 10 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true>
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      // Manually unroll this loop since compilers don't do it.
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        evaluator.evalPacket(i);
-        evaluator.evalPacket(i+PacketSize);
-        evaluator.evalPacket(i+2*PacketSize);
-        evaluator.evalPacket(i+3*PacketSize);
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
@ -104,12 +105,13 @@ struct EvalRange<Evaluator, Index, true> {
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
-      // Manually unroll this loop since compilers don't do it.
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        evaluator.evalPacket(i);
-        evaluator.evalPacket(i+PacketSize);
-        evaluator.evalPacket(i+2*PacketSize);
-        evaluator.evalPacket(i+3*PacketSize);
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {