Don't use unrolled loops for stateful reducers. The problem is the combination step, e.g.

reducer0.reducePacket(accum1, accum0);
reducer0.reducePacket(accum2, accum0);
reducer0.reducePacket(accum3, accum0);

For the mean reducer, each of these calls increments the count in addition to adding the accumulators together, so the wrong count ends up being divided into the sum at the end.
This commit is contained in:
Rasmus Munk Larsen 2021-10-28 23:52:54 +00:00
parent 9cf34ee0ae
commit b3bea43a2d

View File

@@ -190,7 +190,7 @@ struct InnerMostDimReducer<Self, Op, true, false> {
constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size; constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
Index start = 0; Index start = 0;
typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>(); typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
if (numValuesToReduce >= 4*packetSize) { if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4*packetSize) {
const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize); const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize);
typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>(); typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>(); typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
@@ -313,7 +313,7 @@ struct InnerMostDimPreserver<0, Self, Op, true> {
using Index = typename Self::Index; using Index = typename Self::Index;
const Index stride = self.m_reducedStrides[0]; const Index stride = self.m_reducedStrides[0];
const Index size = self.m_reducedDims[0]; const Index size = self.m_reducedDims[0];
if (size >= 16) { if (!Self::ReducerTraits::IsStateful && size >= 16) {
const Index unrolled_size4 = (size / 4) * 4; const Index unrolled_size4 = (size / 4) * 4;
typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>(); typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>(); typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();