Improve accuracy of full tensor reduction for half and bfloat16 by reducing leaf size in tree reduction.

Add more unit tests for summation accuracy.
Rasmus Munk Larsen 2021-10-19 17:36:07 -07:00
parent 95bb645e92
commit 360290fc42
2 changed files with 31 additions and 15 deletions
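
For context, the sketch below (not Eigen code; the tree_sum helper, the 128-element leaf and the bfloat16 demo in main are illustrative only) shows the idea this patch tunes: pairwise "tree" summation that falls back to a sequential loop only below a leaf size, so each input takes part in O(log n) additions, and why a small leaf matters for an 8-significand-bit type such as bfloat16.

#include <Eigen/Core>   // Eigen::bfloat16 for the demo below
#include <cstddef>
#include <iostream>
#include <vector>

// Pairwise (tree) summation with a leaf size: below the leaf the sum is a
// plain sequential loop; above it the range is halved and the two partial
// sums are added, so each input participates in only O(log n) additions.
template <typename T>
T tree_sum(const std::vector<T>& data, std::size_t begin, std::size_t end,
           std::size_t leaf_size) {
  if (end - begin <= leaf_size) {
    T acc(0);
    for (std::size_t i = begin; i < end; ++i) acc += data[i];
    return acc;
  }
  const std::size_t mid = begin + (end - begin) / 2;
  return tree_sum(data, begin, mid, leaf_size) +
         tree_sum(data, mid, end, leaf_size);
}

int main() {
  const std::size_t n = 10 * 1024;
  std::vector<Eigen::bfloat16> v(n, Eigen::bfloat16(1.0f));

  // Sequential accumulation stalls at 256: with 8 significand bits,
  // 256 + 1 rounds back to 256 under round-to-nearest-even.
  Eigen::bfloat16 seq(0.0f);
  for (const auto& x : v) seq += x;

  // Tree summation with a 128-element leaf keeps every partial sum exact here.
  Eigen::bfloat16 tree = tree_sum(v, 0, v.size(), 128);

  std::cout << "sequential: " << float(seq) << ", tree: " << float(tree)
            << ", exact: " << n << "\n";  // prints 256 vs 10240
}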

unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h

@@ -227,13 +227,19 @@ struct InnerMostDimReducer<Self, Op, true, false> {
 // The following implements tree-based reduction, which improves the accuracy
 // of sum and mean reductions, since each of the n inputs only participates in
 // O(log n) additions.
-static const int kLeafSize = 1024;
+template <typename T>
+EIGEN_DEVICE_FUNC inline Index LeafSize() { return 1024; }
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<half>() { return 200; }
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() { return 128; }
 
 template <typename Self, typename Op>
 struct InnerMostDimReducer<Self, Op, false, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
   reduce(const Self& self, typename Self::Index firstIndex,
          typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
     typename Self::CoeffReturnType accum = reducer.initialize();
     if (numValuesToReduce > kLeafSize) {
       const typename Self::Index half = numValuesToReduce / 2;
@@ -254,6 +260,7 @@ struct InnerMostDimReducer<Self, Op, true, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
   reduce(const Self& self, typename Self::Index firstIndex,
          typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
     const typename Self::Index packetSize =
         internal::unpacket_traits<typename Self::PacketReturnType>::size;
     typename Self::CoeffReturnType accum = reducer.initialize();
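
The new leaf sizes (1024 in the generic case, 200 for half, 128 for bfloat16) shrink as the type's rounding unit grows. A small standalone check of the machine epsilons via Eigen's NumTraits, not part of the patch, makes the gap concrete:

#include <Eigen/Core>
#include <iostream>

int main() {
  // Relative rounding unit of each scalar type handled above.
  std::cout << "float:    eps = "
            << Eigen::NumTraits<float>::epsilon() << "\n"                     // 2^-23 ~ 1.2e-7
            << "half:     eps = "
            << float(Eigen::NumTraits<Eigen::half>::epsilon()) << "\n"        // 2^-10 ~ 9.8e-4
            << "bfloat16: eps = "
            << float(Eigen::NumTraits<Eigen::bfloat16>::epsilon()) << "\n";   // 2^-7  ~ 7.8e-3
}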

unsupported/test/cxx11_tensor_reduction.cpp

@@ -486,22 +486,25 @@ static void test_reduce_middle_dims() {
   }
 }
 
-static void test_sum_accuracy() {
-  Tensor<float, 3> tensor(101, 101, 101);
-  for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
-    tensor.setRandom();
-    tensor += tensor.constant(prescribed_mean);
+template <typename ScalarType, int num_elements, int max_mean>
+void test_sum_accuracy() {
+  Tensor<double, 1> double_tensor(num_elements);
+  Tensor<ScalarType, 1> tensor(num_elements);
+  for (double prescribed_mean = 0; prescribed_mean <= max_mean; prescribed_mean = numext::maxi(1.0, prescribed_mean*3.99)) {
+    // FIXME: NormalRandomGenerator doesn't work in bfloat and half.
+    double_tensor.setRandom<Eigen::internal::NormalRandomGenerator<double>>();
+    double_tensor += double_tensor.constant(prescribed_mean);
+    tensor = double_tensor.cast<ScalarType>();
 
-    Tensor<float, 0> sum = tensor.sum();
+    Tensor<ScalarType, 0> sum;
+    sum = tensor.sum();
 
     // Compute the reference value in double precision.
     double expected_sum = 0.0;
-    for (int i = 0; i < 101; ++i) {
-      for (int j = 0; j < 101; ++j) {
-        for (int k = 0; k < 101; ++k) {
-          expected_sum += static_cast<double>(tensor(i, j, k));
-        }
-      }
+    for (int i = 0; i < num_elements; ++i) {
+      expected_sum += static_cast<double>(tensor(i));
     }
-    VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
+    VERIFY_IS_APPROX(sum(), static_cast<ScalarType>(expected_sum));
   }
 }
@@ -528,5 +531,11 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
   CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
-  CALL_SUBTEST(test_sum_accuracy());
+  CALL_SUBTEST((test_sum_accuracy<float,10*1024*1024,8*1024>()));
+  CALL_SUBTEST((test_sum_accuracy<Eigen::bfloat16,10*1024*1024,8*1024>()));
+  // The range of half is limited to 65519 when using round-to-even,
+  // so we are severely limited in the size and mean of the tensors
+  // we can reduce without overflow.
+  CALL_SUBTEST((test_sum_accuracy<Eigen::half,4*1024,16>()));
+  CALL_SUBTEST((test_sum_accuracy<Eigen::half,10*1024*1024,0>()));
 }
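
The 65519 in the comment above is the largest value that still rounds down to half's maximum finite number (65504) under round-to-nearest-even; anything at or above 65520 converts to +inf, which is why the half subtests use either few elements or a zero mean. A tiny illustration, not part of the patch:

#include <Eigen/Core>
#include <iostream>

int main() {
  // half's largest finite value is 65504; with round-to-nearest-even,
  // floats strictly below 65520 round down to it, 65520 and above overflow.
  std::cout << float(Eigen::half(65504.0f)) << "\n";  // 65504
  std::cout << float(Eigen::half(65519.0f)) << "\n";  // 65504
  std::cout << float(Eigen::half(65520.0f)) << "\n";  // inf
}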