Speed up tensor reduction

Rasmus Munk Larsen 2021-10-02 14:58:23 +00:00 committed by Antonio Sánchez
parent be9e7d205f
commit 1d75fab368


@@ -21,7 +21,6 @@
#endif
#endif
#include "./InternalHeaderCheck.h"
namespace Eigen {
@@ -168,8 +167,8 @@ struct GenericDimReducer<-1, Self, Op> {
};
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
!Self::ReducerTraits::IsExactlyAssociative)>
struct InnerMostDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
typename Self::CoeffReturnType accum = reducer.initialize();
@@ -182,22 +181,52 @@ struct InnerMostDimReducer {
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
-const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
-for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
+using Index = typename Self::Index;
+constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+Index start = 0;
+typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+if (numValuesToReduce >= 4*packetSize) {
+const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize);
+typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+const Index offset0 = firstIndex;
+const Index offset1 = firstIndex + packetSize;
+const Index offset2 = firstIndex + 2*packetSize;
+const Index offset3 = firstIndex + 3*packetSize;
+for (Index j = 0; j < VectorizedSize4; j += 4*packetSize) {
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
+}
+reducer0.reducePacket(paccum1, &paccum0);
+reducer0.reducePacket(paccum2, &paccum0);
+reducer0.reducePacket(paccum3, &paccum0);
+start = VectorizedSize4;
+}
-typename Self::CoeffReturnType accum = reducer.initialize();
-for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
-reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+if (start <= (numValuesToReduce - packetSize)) {
+const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+for (Index j = start; j < VectorizedSize; j += packetSize) {
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
+}
+start = VectorizedSize;
+}
-return reducer.finalizeBoth(accum, paccum);
+typename Self::CoeffReturnType accum = reducer0.initialize();
+for (Index j = start; j < numValuesToReduce; ++j) {
+reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+}
+return reducer0.finalizeBoth(accum, paccum0);
}
};
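The heart of this hunk is four-way unrolling with independent packet accumulators: with a single paccum, every reducePacket must wait for the previous one to retire, while paccum0..paccum3 give the CPU four independent dependency chains whose additions can overlap in the pipeline. A minimal standalone sketch of the same idea, using plain floats instead of Eigen packets (sum4 is a hypothetical name, not part of Eigen):

#include <cstddef>

// Sketch: sum with four independent accumulators, mirroring the paccum0..3
// pattern in the hunk above. Each partial sum depends only on itself, so
// adds from different chains can execute in parallel.
float sum4(const float* data, std::size_t n) {
  float acc0 = 0.f, acc1 = 0.f, acc2 = 0.f, acc3 = 0.f;
  std::size_t j = 0;
  for (; j + 4 <= n; j += 4) {  // unrolled main loop
    acc0 += data[j + 0];
    acc1 += data[j + 1];
    acc2 += data[j + 2];
    acc3 += data[j + 3];
  }
  for (; j < n; ++j) acc0 += data[j];  // scalar tail, as in the diff
  // Merge the partial sums, like the reducePacket(paccumN, &paccum0) calls.
  return (acc0 + acc1) + (acc2 + acc3);
}

With, say, a 4-cycle floating-point add latency and one add per cycle of throughput, the single-chain loop runs at roughly a quarter of peak, which is presumably why the new code unrolls by 4*packetSize.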
#if !defined(EIGEN_HIPCC)
// The following implements tree-based reduction, which improves the accuracy
// of sum and mean reductions, since each of the n inputs only participates in
// O(log n) additions.
static const int kLeafSize = 1024;
template <typename Self, typename Op>
@@ -208,16 +237,15 @@ struct InnerMostDimReducer<Self, Op, false, true> {
typename Self::CoeffReturnType accum = reducer.initialize();
if (numValuesToReduce > kLeafSize) {
const typename Self::Index half = numValuesToReduce / 2;
+// Recursively reduce the two halves.
reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
reducer.reduce(
reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
&accum);
+return reducer.finalize(accum);
} else {
-for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
-reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-}
+return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
}
-return reducer.finalize(accum);
}
};
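To see why the comment above helps accuracy: in a left-to-right sum the first input participates in n-1 rounded additions, so worst-case error grows as O(n), whereas halving recursively bounds every input to O(log n) additions. A small sketch of the pattern (kLeaf = 8 is illustrative only; the code above uses kLeafSize = 1024 and now delegates leaf sums to the plain InnerMostDimReducer<Self, Op, false, false> path):

#include <cstddef>

// Sketch of tree-based (pairwise) summation: split in half, reduce each
// half recursively, and only sum short runs sequentially at the leaves.
float tree_sum(const float* data, std::size_t n) {
  constexpr std::size_t kLeaf = 8;  // illustrative; Eigen uses kLeafSize = 1024
  if (n > kLeaf) {
    const std::size_t half = n / 2;
    // Recursively reduce the two halves, as in the code above.
    return tree_sum(data, half) + tree_sum(data + half, n - half);
  }
  float acc = 0.f;
  for (std::size_t j = 0; j < n; ++j) acc += data[j];  // sequential leaf sum
  return acc;
}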
@@ -244,36 +272,12 @@ struct InnerMostDimReducer<Self, Op, true, true> {
}
return reducer.finalize(accum);
} else {
-const typename Self::Index UnrollSize =
-(numValuesToReduce / (2*packetSize)) * 2*packetSize;
-const typename Self::Index VectorizedSize =
-(numValuesToReduce / packetSize) * packetSize;
-typename Self::PacketReturnType paccum =
-reducer.template initializePacket<typename Self::PacketReturnType>();
-typename Self::PacketReturnType paccum2 =
-reducer.template initializePacket<typename Self::PacketReturnType>();
-for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
-reducer.reducePacket(
-self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
-reducer.reducePacket(
-self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
-&paccum2);
-}
-for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
-reducer.reducePacket(self.m_impl.template packet<Unaligned>(
-firstIndex + j), &paccum);
-}
-reducer.reducePacket(paccum2, &paccum);
-for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
-++j) {
-reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-}
-return reducer.finalizeBoth(accum, paccum);
+return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
}
}
};
#endif
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
@@ -294,10 +298,37 @@ struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
-const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
-reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer0, typename Self::PacketReturnType* accum0) {
+using Index = typename Self::Index;
+const Index stride = self.m_reducedStrides[0];
+const Index size = self.m_reducedDims[0];
+if (size >= 16) {
+const Index unrolled_size4 = (size / 4) * 4;
+typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+for (Index j = 0; j < unrolled_size4; j += 4) {
+const Index input0 = firstIndex + j * stride;
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
+const Index input1 = firstIndex + (j+1) * stride;
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
+const Index input2 = firstIndex + (j+2) * stride;
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
+const Index input3 = firstIndex + (j+3) * stride;
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
+}
+reducer0.reducePacket(accum1, accum0);
+reducer0.reducePacket(accum2, accum0);
+reducer0.reducePacket(accum3, accum0);
+for (Index j = unrolled_size4; j < size; ++j) {
+Index input = firstIndex + j * stride;
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+}
+} else {
+for (Index j = 0; j < size; ++j) {
+Index input = firstIndex + j * stride;
+reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+}
+}
}
};
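This hunk applies the same four-accumulator unrolling to strided packet loads, the path taken when the reduced dimension is not the innermost one. A usage sketch that should exercise this path, assuming the unsupported Tensor module headers (the shapes are arbitrary):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Eigen tensors are column-major by default, so dimension 0 is the
  // contiguous (innermost) one. Reducing over dimension 1 preserves the
  // innermost dimension: packets are loaded along dim 0 while the loop
  // above strides across dim 1 via m_reducedStrides[0].
  Eigen::Tensor<float, 2> t(1024, 64);
  t.setRandom();
  Eigen::array<Eigen::Index, 1> dims{{1}};
  Eigen::Tensor<float, 1> row_sums = t.sum(dims);  // 1024 per-row sums
  return 0;
}

Note the size >= 16 guard in the code above: for short strided runs the bookkeeping of four accumulators is not worth it, so the simple loop is kept.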
@@ -353,15 +384,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
PacketSize);
-const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
num_coeffs, cost, device.numThreads());
if (num_threads == 1) {
*output =
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
-const Index blocksize =
-std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+const Index blocksize = num_coeffs / num_threads;
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
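The last hunk replaces std::floor<Index>(static_cast<float>(num_coeffs) / num_threads) with a plain integer division. Besides dropping the int-to-float-to-int round trip, this avoids a precision bug: a float mantissa holds only 24 bits, so beyond roughly 16M coefficients the cast alone can change the value. A small sketch of the difference (the values are illustrative):

#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  using Index = std::int64_t;
  const Index num_coeffs = 100000001;  // not exactly representable as float
  const Index num_threads = 1;
  // Old computation: the detour through float rounds num_coeffs to 100000000.
  const Index old_blocksize =
      static_cast<Index>(std::floor(static_cast<float>(num_coeffs) / num_threads));
  // New computation: exact integer division.
  const Index new_blocksize = num_coeffs / num_threads;
  std::cout << old_blocksize << " vs " << new_blocksize << "\n";  // 100000000 vs 100000001
  return 0;
}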