From b3bea43a2da484d420e20c615cb5c9e3c04024e5 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Thu, 28 Oct 2021 23:52:54 +0000
Subject: [PATCH] Don't use unrolled loops for stateful reducers. The problem
 is the combination step, e.g.

reducer0.reducePacket(accum1, accum0);
reducer0.reducePacket(accum2, accum0);
reducer0.reducePacket(accum3, accum0);

For the mean reducer this will increment the count as well as adding together the accumulators and result in the wrong count being divided into the sum at the end.
---
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 825d33512..b5922477a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -190,7 +190,7 @@ struct InnerMostDimReducer {
     constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
     Index start = 0;
     typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
-    if (numValuesToReduce >= 4*packetSize) {
+    if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4*packetSize) {
       const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize);
       typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
       typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
@@ -313,7 +313,7 @@ struct InnerMostDimPreserver<0, Self, Op, true> {
     using Index = typename Self::Index;
     const Index stride = self.m_reducedStrides[0];
     const Index size = self.m_reducedDims[0];
-    if (size >= 16) {
+    if (!Self::ReducerTraits::IsStateful && size >= 16) {
       const Index unrolled_size4 = (size / 4) * 4;
       typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
       typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();