From 1d75fab368302a12fd1ff82003ed6a282ecab2b1 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Sat, 2 Oct 2021 14:58:23 +0000
Subject: [PATCH] Speed up tensor reduction

---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 132 +++++++++++-------
 1 file changed, 81 insertions(+), 51 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index aef2b4341..f13ee48e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -21,7 +21,6 @@
 #endif
 #endif
 
-
 #include "./InternalHeaderCheck.h"
 
 namespace Eigen {
@@ -168,8 +167,8 @@ struct GenericDimReducer<-1, Self, Op> {
 };
 
 template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
+                                   !Self::ReducerTraits::IsExactlyAssociative)>
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
     typename Self::CoeffReturnType accum = reducer.initialize();
@@ -182,22 +181,52 @@ struct InnerMostDimReducer<Self, Op, true, false> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
-    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
-    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
+    using Index = typename Self::Index;
+    constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    Index start = 0;
+    typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+    if (numValuesToReduce >= 4*packetSize) {
+      const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize);
+      typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      const Index offset0 = firstIndex;
+      const Index offset1 = firstIndex + packetSize;
+      const Index offset2 = firstIndex + 2*packetSize;
+      const Index offset3 = firstIndex + 3*packetSize;
+      for (Index j = 0; j < VectorizedSize4; j += 4*packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
+      }
+      reducer0.reducePacket(paccum1, &paccum0);
+      reducer0.reducePacket(paccum2, &paccum0);
+      reducer0.reducePacket(paccum3, &paccum0);
+      start = VectorizedSize4;
     }
-    typename Self::CoeffReturnType accum = reducer.initialize();
-    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
-      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    if (start <= (numValuesToReduce - packetSize)) {
+      const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+      for (Index j = start; j < VectorizedSize; j += packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
+      }
+      start = VectorizedSize;
     }
-    return reducer.finalizeBoth(accum, paccum);
+    typename Self::CoeffReturnType accum = reducer0.initialize();
+    for (Index j = start; j < numValuesToReduce; ++j) {
+      reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    }
+    return reducer0.finalizeBoth(accum, paccum0);
   }
 };
 
-#if !defined(EIGEN_HIPCC)
+
+#if !defined(EIGEN_HIPCC)
+
+// The following implements tree-based reduction, which improves the accuracy
+// of sum and mean reductions, since each of the n inputs only participates in
+// O(log n) additions.
 static const int kLeafSize = 1024;
 
 template <typename Self, typename Op>
@@ -208,16 +237,15 @@ struct InnerMostDimReducer<Self, Op, false, true> {
     typename Self::CoeffReturnType accum = reducer.initialize();
     if (numValuesToReduce > kLeafSize) {
       const typename Self::Index half = numValuesToReduce / 2;
+      // Recursively reduce the two halves.
       reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
       reducer.reduce(
           reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
           &accum);
+      return reducer.finalize(accum);
     } else {
-      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
+      return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
     }
-    return reducer.finalize(accum);
   }
 };
 
@@ -244,36 +272,12 @@ struct InnerMostDimReducer<Self, Op, true, true> {
       }
       return reducer.finalize(accum);
     } else {
-      const typename Self::Index UnrollSize =
-          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
-      const typename Self::Index VectorizedSize =
-          (numValuesToReduce / packetSize) * packetSize;
-      typename Self::PacketReturnType paccum =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      typename Self::PacketReturnType paccum2 =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
-            &paccum2);
-      }
-      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
-        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
-            firstIndex + j), &paccum);
-      }
-      reducer.reducePacket(paccum2, &paccum);
-      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
-           ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
-      return reducer.finalizeBoth(accum, paccum);
+      return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
     }
   }
 };
 #endif
- 
+
 template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct InnerMostDimPreserver {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
@@ -294,10 +298,37 @@ struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
 template <typename Self, typename Op>
 struct InnerMostDimPreserver<0, Self, Op, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
-      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer0, typename Self::PacketReturnType* accum0) {
+    using Index = typename Self::Index;
+    const Index stride = self.m_reducedStrides[0];
+    const Index size = self.m_reducedDims[0];
+    if (size >= 16) {
+      const Index unrolled_size4 = (size / 4) * 4;
+      typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      for (Index j = 0; j < unrolled_size4; j += 4) {
+        const Index input0 = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
+        const Index input1 = firstIndex + (j+1) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
+        const Index input2 = firstIndex + (j+2) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
+        const Index input3 = firstIndex + (j+3) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
+      }
+      reducer0.reducePacket(accum1, accum0);
+      reducer0.reducePacket(accum2, accum0);
+      reducer0.reducePacket(accum3, accum0);
+      for (Index j = unrolled_size4; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
+    } else {
+      for (Index j = 0; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
     }
   }
 };
@@ -353,15 +384,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
         self.m_impl.costPerCoeff(Vectorizable) +
         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         num_coeffs, cost, device.numThreads());
     if (num_threads == 1) {
       *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
       return;
     }
-    const Index blocksize =
-        std::floor(static_cast<double>(num_coeffs) / num_threads);
+    const Index blocksize = num_coeffs / num_threads;
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
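
Note (editorial, not part of the patch): the speedup comes from the classic multiple-accumulator unrolling trick. With a single packet accumulator, each reducePacket depends on the result of the previous one, so the loop is bound by the latency of one vector add per packet; the four independent paccum0..paccum3 chains let the CPU keep several adds in flight at once. A minimal scalar sketch of the same idea, with illustrative names only (plain floats instead of Eigen packets):

#include <cstddef>

// Sums n floats with four independent partial sums, mirroring the
// paccum0..paccum3 pattern in InnerMostDimReducer<Self, Op, true, false>.
float unrolled_sum(const float* data, std::size_t n) {
  float acc0 = 0.f, acc1 = 0.f, acc2 = 0.f, acc3 = 0.f;
  std::size_t j = 0;
  // Main loop: four independent dependency chains per iteration.
  for (; j + 4 <= n; j += 4) {
    acc0 += data[j + 0];
    acc1 += data[j + 1];
    acc2 += data[j + 2];
    acc3 += data[j + 3];
  }
  // Scalar tail, like the final `for (Index j = start; ...)` loop above.
  for (; j < n; ++j) acc0 += data[j];
  // Combine the partials, like the reducePacket(paccumN, &paccum0) calls.
  return (acc0 + acc1) + (acc2 + acc3);
}

This does reassociate the sum, but the packetized path was already reassociating; the tree-reduction path is the one that carries the accuracy guarantee.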
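The comment re-added above kLeafSize is worth unpacking: with recursive halving, each input takes part in one addition per recursion level plus a bounded leaf loop, so worst-case rounding error grows roughly logarithmically in n rather than linearly. A self-contained scalar sketch of the same scheme (illustrative, not the Eigen API):

#include <cstddef>

constexpr std::size_t kLeafSize = 1024;  // same leaf size the patch uses

// Recursive pairwise (tree) summation: reduce each half, then combine,
// falling back to a simple loop at the leaves -- the same shape as
// InnerMostDimReducer<Self, Op, false, true>::reduce in the patch.
float tree_sum(const float* data, std::size_t n) {
  if (n > kLeafSize) {
    const std::size_t half = n / 2;
    return tree_sum(data, half) + tree_sum(data + half, n - half);
  }
  float acc = 0.f;
  for (std::size_t j = 0; j < n; ++j) acc += data[j];
  return acc;
}

Routing the leaf cases back through the non-tree reducers instead of a hand-written loop removes duplicated code and, in the vectorized specialization (which falls back to InnerMostDimReducer<Self, Op, true, false>), lets the leaves pick up the new 4-way unrolled path for free.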