From 3e4a68cc60197b75e4bc9715f6752906c420c50b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 12 Nov 2011 09:19:48 +0100 Subject: [PATCH] optimize vectorized reductions by peeling the loop: - x2 for squaredNorm() on double - peeling the loop with a peeling factor of 4 leads to even better perf for large vectors (e.g., >64) but it makes more difficult to keep good performance on smaller ones. --- Eigen/src/Core/Redux.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index f9f5a95d5..f25dd2eeb 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -219,15 +219,28 @@ struct redux_impl alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit) ? Aligned : Unaligned }; - const Index alignedSize = ((size-alignedStart)/packetSize)*packetSize; - const Index alignedEnd = alignedStart + alignedSize; + const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); + const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); + const Index alignedEnd2 = alignedStart + alignedSize2; + const Index alignedEnd = alignedStart + alignedSize; Scalar res; if(alignedSize) { - PacketScalar packet_res = mat.template packet(alignedStart); - for(Index index = alignedStart + packetSize; index < alignedEnd; index += packetSize) - packet_res = func.packetOp(packet_res, mat.template packet(index)); - res = func.predux(packet_res); + PacketScalar packet_res0 = mat.template packet(alignedStart); + if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop + { + PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); + for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) + { + packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); + packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); + } + + packet_res0 = func.packetOp(packet_res0,packet_res1); + if(alignedEnd>alignedEnd2) + packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); + } + res = func.predux(packet_res0); for(Index index = 0; index < alignedStart; ++index) res = func(res,mat.coeff(index));