mirror of https://gitlab.com/libeigen/eigen.git
Speed up tensor reduction

parent be9e7d205f
commit 1d75fab368
@@ -21,7 +21,6 @@
 #endif
 #endif
 
-
 #include "./InternalHeaderCheck.h"
 
 namespace Eigen {
@@ -182,22 +181,52 @@ struct InnerMostDimReducer {
 
 template <typename Self, typename Op>
 struct InnerMostDimReducer<Self, Op, true, false> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
-    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
-    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
+    using Index = typename Self::Index;
+    constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    Index start = 0;
+    typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+    if (numValuesToReduce >= 4*packetSize) {
+      const Index VectorizedSize4 = (numValuesToReduce / (4*packetSize)) * (4*packetSize);
+      typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      const Index offset0 = firstIndex;
+      const Index offset1 = firstIndex + packetSize;
+      const Index offset2 = firstIndex + 2*packetSize;
+      const Index offset3 = firstIndex + 3*packetSize;
+      for (Index j = 0; j < VectorizedSize4; j += 4*packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
+      }
+      reducer0.reducePacket(paccum1, &paccum0);
+      reducer0.reducePacket(paccum2, &paccum0);
+      reducer0.reducePacket(paccum3, &paccum0);
+      start = VectorizedSize4;
     }
-    typename Self::CoeffReturnType accum = reducer.initialize();
-    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
-      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    if (start <= (numValuesToReduce - packetSize)) {
+      const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+      for (Index j = start; j < VectorizedSize; j += packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
+      }
+      start = VectorizedSize;
+    }
+    typename Self::CoeffReturnType accum = reducer0.initialize();
+    for (Index j = start; j < numValuesToReduce; ++j) {
+      reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
     }
-    return reducer.finalizeBoth(accum, paccum);
+    return reducer0.finalizeBoth(accum, paccum0);
   }
 };
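A note on the hunk above: with a single packet accumulator, each reducePacket for a sum-like reducer must wait on the previous one, so the loop runs at the latency of the add rather than its throughput. Four independent accumulators break that dependency chain and are merged once at the end. A minimal standalone scalar sketch of the same idea (illustrative code, not part of the patch):

```cpp
// Scalar sketch of the 4-accumulator unroll: independent partial sums let the
// additions overlap in the pipeline; Eigen's version does the same per packet.
#include <cstddef>

float unrolled_sum(const float* data, std::size_t n) {
  float acc0 = 0.f, acc1 = 0.f, acc2 = 0.f, acc3 = 0.f;
  std::size_t j = 0;
  // Main loop: four independent accumulators, one per lane of the unroll.
  for (; j + 4 <= n; j += 4) {
    acc0 += data[j + 0];
    acc1 += data[j + 1];
    acc2 += data[j + 2];
    acc3 += data[j + 3];
  }
  // Fold the partial sums, then handle the tail, mirroring the reducePacket
  // merges and the final coeff loop in the patch.
  float acc = (acc0 + acc1) + (acc2 + acc3);
  for (; j < n; ++j) acc += data[j];
  return acc;
}
```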
 
 #if !defined(EIGEN_HIPCC)
 
 // The following implements tree-based reduction, which improves the accuracy
 // of sum and mean reductions, since each of the n inputs only participates in
 // O(log n) additions.
 static const int kLeafSize = 1024;
 
 template <typename Self, typename Op>
@@ -208,16 +237,15 @@ struct InnerMostDimReducer<Self, Op, false, true> {
     typename Self::CoeffReturnType accum = reducer.initialize();
     if (numValuesToReduce > kLeafSize) {
       const typename Self::Index half = numValuesToReduce / 2;
+      // Recursively reduce the two halves.
       reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
       reducer.reduce(
           reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
           &accum);
+      return reducer.finalize(accum);
     } else {
-      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
+      return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
     }
-    return reducer.finalize(accum);
   }
 };
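The comment block above describes the scheme this specialization implements: below kLeafSize the work is delegated to the plain loop-based reducer, above it the range is split in half and each half reduced recursively, so every input participates in only O(log n) additions. A minimal standalone sketch of that scheme for a float sum (illustrative code, not part of the patch):

```cpp
// Tree (pairwise) summation sketch: recursion depth is O(log n), so rounding
// error grows far more slowly than with a single running accumulator.
#include <cstddef>

float tree_sum(const float* data, std::size_t n) {
  constexpr std::size_t kLeaf = 1024;  // mirrors kLeafSize in the patch
  if (n <= kLeaf) {
    // Leaf: simple sequential reduction.
    float acc = 0.f;
    for (std::size_t j = 0; j < n; ++j) acc += data[j];
    return acc;
  }
  const std::size_t half = n / 2;
  // Recursively reduce the two halves, then combine once.
  return tree_sum(data, half) + tree_sum(data + half, n - half);
}
```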
 
@@ -244,31 +272,7 @@ struct InnerMostDimReducer<Self, Op, true, true> {
       }
       return reducer.finalize(accum);
     } else {
-      const typename Self::Index UnrollSize =
-          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
-      const typename Self::Index VectorizedSize =
-          (numValuesToReduce / packetSize) * packetSize;
-      typename Self::PacketReturnType paccum =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      typename Self::PacketReturnType paccum2 =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
-            &paccum2);
-      }
-      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
-        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
-            firstIndex + j), &paccum);
-      }
-      reducer.reducePacket(paccum2, &paccum);
-      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
-           ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
-      return reducer.finalizeBoth(accum, paccum);
+      return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
     }
   }
 };
 
@@ -294,10 +298,37 @@ struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
 
 template <typename Self, typename Op>
 struct InnerMostDimPreserver<0, Self, Op, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
-      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
-    }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer0, typename Self::PacketReturnType* accum0) {
+    using Index = typename Self::Index;
+    const Index stride = self.m_reducedStrides[0];
+    const Index size = self.m_reducedDims[0];
+    if (size >= 16) {
+      const Index unrolled_size4 = (size / 4) * 4;
+      typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      for (Index j = 0; j < unrolled_size4; j += 4) {
+        const Index input0 = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
+        const Index input1 = firstIndex + (j+1) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
+        const Index input2 = firstIndex + (j+2) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
+        const Index input3 = firstIndex + (j+3) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
+      }
+      reducer0.reducePacket(accum1, accum0);
+      reducer0.reducePacket(accum2, accum0);
+      reducer0.reducePacket(accum3, accum0);
+      for (Index j = unrolled_size4; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
+    } else {
+      for (Index j = 0; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
+    }
   }
 };
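For context, a usage sketch of the kind of user-level reductions that reach the kernels patched above: inner-dimension reductions go through InnerMostDimReducer, while reductions that preserve the inner dimension go through InnerMostDimPreserver. Which specialization actually runs depends on layout, scalar type, reducer, and evaluator traits; the shapes and values here are illustrative only.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> t(256, 1024);
  t.setRandom();

  // Reduce over the innermost dimension (dim 0 in the default column-major
  // layout): each output coefficient reduces a contiguous run of memory.
  Eigen::array<int, 1> inner{{0}};
  Eigen::Tensor<float, 1> inner_sums = t.sum(inner);

  // Reduce over the outer dimension: the inner dimension is preserved, so
  // the strided path exercised by InnerMostDimPreserver is used.
  Eigen::array<int, 1> outer{{1}};
  Eigen::Tensor<float, 1> outer_sums = t.sum(outer);

  std::cout << inner_sums(0) << " " << outer_sums(0) << "\n";
  return 0;
}
```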
 
@@ -353,15 +384,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
         self.m_impl.costPerCoeff(Vectorizable) +
         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                      PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         num_coeffs, cost, device.numThreads());
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
       return;
     }
-    const Index blocksize =
-        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+    const Index blocksize = num_coeffs / num_threads;
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
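The blocksize change above replaces a float divide-and-floor with exact integer division. Besides being cheaper, this avoids a rounding hazard: a float cannot represent every large index value exactly. A standalone sketch with illustrative numbers:

```cpp
// Demonstrates the rounding hazard the patch removes: converting a large
// coefficient count to float loses precision, so floor(float division) can
// disagree with exact integer division.
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  // 2^25 + 2 = 33554434 is not exactly representable as float (it rounds
  // down to 33554432).
  const std::int64_t num_coeffs = (std::int64_t(1) << 25) + 2;
  const std::int64_t num_threads = 2;
  const std::int64_t via_float = static_cast<std::int64_t>(
      std::floor(static_cast<float>(num_coeffs) / num_threads));
  const std::int64_t via_int = num_coeffs / num_threads;
  std::cout << via_float << " vs " << via_int << "\n";  // 16777216 vs 16777217
  return 0;
}
```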