From 1d75fab368302a12fd1ff82003ed6a282ecab2b1 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Sat, 2 Oct 2021 14:58:23 +0000
Subject: [PATCH] Speed up tensor reduction

---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 132 +++++++++++-------
 1 file changed, 81 insertions(+), 51 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index aef2b4341..f13ee48e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -21,7 +21,6 @@
 #endif
 #endif
 
-
 #include "./InternalHeaderCheck.h"
 
 namespace Eigen {
@@ -168,8 +167,8 @@ struct GenericDimReducer<-1, Self, Op> {
 };
 
 template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
+                                   !Self::ReducerTraits::IsExactlyAssociative)>
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
     typename Self::CoeffReturnType accum = reducer.initialize();
@@ -182,22 +181,52 @@ struct InnerMostDimReducer<Self, Op, true, false> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
-    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
-    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
+    using Index = typename Self::Index;
+    constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    Index start = 0;
+    typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+    if (numValuesToReduce >= 4*packetSize) {
+      const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize);
+      typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      const Index offset0 = firstIndex;
+      const Index offset1 = firstIndex + packetSize;
+      const Index offset2 = firstIndex + 2*packetSize;
+      const Index offset3 = firstIndex + 3*packetSize;
+      for (Index j = 0; j < VectorizedSize4; j += 4*packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
+      }
+      reducer0.reducePacket(paccum1, &paccum0);
+      reducer0.reducePacket(paccum2, &paccum0);
+      reducer0.reducePacket(paccum3, &paccum0);
+      start = VectorizedSize4;
     }
-    typename Self::CoeffReturnType accum = reducer.initialize();
-    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
-      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    if (start <= (numValuesToReduce - packetSize)) {
+      const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+      for (Index j = start; j < VectorizedSize; j += packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
+      }
+      start = VectorizedSize;
     }
-    return reducer.finalizeBoth(accum, paccum);
+    typename Self::CoeffReturnType accum = reducer0.initialize();
+    for (Index j = start; j < numValuesToReduce; ++j) {
+      reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    }
+    return reducer0.finalizeBoth(accum, paccum0);
   }
 };
 
-#if !defined(EIGEN_HIPCC)
+
+#if !defined(EIGEN_HIPCC)
+
+// The following implements tree-based reduction, which improves the accuracy
+// of sum and mean reductions, since each of the n inputs only participates in
+// O(log n) additions.
 static const int kLeafSize = 1024;
 
 template <typename Self, typename Op>
@@ -208,16 +237,15 @@ struct InnerMostDimReducer<Self, Op, false, true> {
     typename Self::CoeffReturnType accum = reducer.initialize();
     if (numValuesToReduce > kLeafSize) {
       const typename Self::Index half = numValuesToReduce / 2;
+      // Recursively reduce the two halves.
       reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
       reducer.reduce(
           reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
           &accum);
+      return reducer.finalize(accum);
     } else {
-      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
+      return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
     }
-    return reducer.finalize(accum);
   }
 };
 
@@ -244,36 +272,12 @@ struct InnerMostDimReducer<Self, Op, true, true> {
       }
       return reducer.finalize(accum);
     } else {
-      const typename Self::Index UnrollSize =
-          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
-      const typename Self::Index VectorizedSize =
-          (numValuesToReduce / packetSize) * packetSize;
-      typename Self::PacketReturnType paccum =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      typename Self::PacketReturnType paccum2 =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
-            &paccum2);
-      }
-      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
-        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
-            firstIndex + j), &paccum);
-      }
-      reducer.reducePacket(paccum2, &paccum);
-      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
-           ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
-      return reducer.finalizeBoth(accum, paccum);
+      return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
     }
   }
 };
 #endif
- 
+
 template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct InnerMostDimPreserver {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
@@ -294,10 +298,37 @@ struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
 template <typename Self, typename Op>
 struct InnerMostDimPreserver<0, Self, Op, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
-      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer0, typename Self::PacketReturnType* accum0) {
+    using Index = typename Self::Index;
+    const Index stride = self.m_reducedStrides[0];
+    const Index size = self.m_reducedDims[0];
+    if (size >= 16) {
+      const Index unrolled_size4 = (size / 4) * 4;
+      typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      for (Index j = 0; j < unrolled_size4; j += 4) {
+        const Index input0 = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
+        const Index input1 = firstIndex + (j+1) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
+        const Index input2 = firstIndex + (j+2) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
+        const Index input3 = firstIndex + (j+3) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
+      }
+      reducer0.reducePacket(accum1, accum0);
+      reducer0.reducePacket(accum2, accum0);
+      reducer0.reducePacket(accum3, accum0);
+      for (Index j = unrolled_size4; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
+    } else {
+      for (Index j = 0; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
     }
   }
 };
@@ -353,15 +384,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
         self.m_impl.costPerCoeff(Vectorizable) +
         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         num_coeffs, cost, device.numThreads());
     if (num_threads == 1) {
       *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
       return;
     }
-    const Index blocksize =
-        std::floor(static_cast<double>(num_coeffs) / num_threads);
+    const Index blocksize = num_coeffs / num_threads;
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
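
Note (editorial, not part of the patch): the speedup comes from the classic multiple-accumulator unrolling trick. With a single packet accumulator, each reducePacket depends on the result of the previous one, so the loop is bound by the latency of one vector add per packet; the four independent paccum0..paccum3 chains let the CPU keep several adds in flight at once. A minimal scalar sketch of the same idea, with illustrative names only (plain floats instead of Eigen packets):

#include <cstddef>

// Sums n floats with four independent partial sums, mirroring the
// paccum0..paccum3 pattern in InnerMostDimReducer<Self, Op, true, false>.
float unrolled_sum(const float* data, std::size_t n) {
  float acc0 = 0.f, acc1 = 0.f, acc2 = 0.f, acc3 = 0.f;
  std::size_t j = 0;
  // Main loop: four independent dependency chains per iteration.
  for (; j + 4 <= n; j += 4) {
    acc0 += data[j + 0];
    acc1 += data[j + 1];
    acc2 += data[j + 2];
    acc3 += data[j + 3];
  }
  // Scalar tail, like the final `for (Index j = start; ...)` loop above.
  for (; j < n; ++j) acc0 += data[j];
  // Combine the partials, like the reducePacket(paccumN, &paccum0) calls.
  return (acc0 + acc1) + (acc2 + acc3);
}

This does reassociate the sum, but the packetized path was already reassociating; the tree-reduction path is the one that carries the accuracy guarantee.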
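The comment re-added above kLeafSize is worth unpacking: with recursive halving, each input takes part in one addition per recursion level plus a bounded leaf loop, so worst-case rounding error grows roughly logarithmically in n rather than linearly. A self-contained scalar sketch of the same scheme (illustrative, not the Eigen API):

#include <cstddef>

constexpr std::size_t kLeafSize = 1024;  // same leaf size the patch uses

// Recursive pairwise (tree) summation: reduce each half, then combine,
// falling back to a simple loop at the leaves -- the same shape as
// InnerMostDimReducer<Self, Op, false, true>::reduce in the patch.
float tree_sum(const float* data, std::size_t n) {
  if (n > kLeafSize) {
    const std::size_t half = n / 2;
    return tree_sum(data, half) + tree_sum(data + half, n - half);
  }
  float acc = 0.f;
  for (std::size_t j = 0; j < n; ++j) acc += data[j];
  return acc;
}

Routing the leaf cases back through the non-tree reducers instead of a hand-written loop removes duplicated code and, in the vectorized specialization (which falls back to InnerMostDimReducer<Self, Op, true, false>), lets the leaves pick up the new 4-way unrolled path for free.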