diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index f13ee48e9..87186e1b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -227,13 +227,19 @@ struct InnerMostDimReducer {
 // The following implements tree-based reduction, which improves the accuracy
 // of sum and mean reductions, since each of the n inputs only participates in
 // O(log n) additions.
-static const int kLeafSize = 1024;
+template <typename T>
+EIGEN_DEVICE_FUNC inline Index LeafSize() { return 1024; }
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<half>() { return 200; }
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() { return 128; }
 
 template <typename Self, typename Op>
 struct InnerMostDimReducer<Self, Op, false, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
   reduce(const Self& self, typename Self::Index firstIndex,
          typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
     typename Self::CoeffReturnType accum = reducer.initialize();
     if (numValuesToReduce > kLeafSize) {
       const typename Self::Index half = numValuesToReduce / 2;
@@ -254,6 +260,7 @@ struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
   reduce(const Self& self, typename Self::Index firstIndex,
          typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
     const typename Self::Index packetSize =
         internal::unpacket_traits<typename Self::PacketReturnType>::size;
     typename Self::CoeffReturnType accum = reducer.initialize();
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index c46c4c91d..b7611d7b0 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -486,22 +486,25 @@ static void test_reduce_middle_dims() {
   }
 }
 
-static void test_sum_accuracy() {
-  Tensor<float, 3> tensor(101, 101, 101);
-  for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
-    tensor.setRandom();
-    tensor += tensor.constant(prescribed_mean);
+template <typename ScalarType, int num_elements, int max_mean>
+void test_sum_accuracy() {
+  Tensor<double, 1> double_tensor(num_elements);
+  Tensor<ScalarType, 1> tensor(num_elements);
+  for (double prescribed_mean = 0; prescribed_mean <= max_mean; prescribed_mean = numext::maxi(1.0, prescribed_mean*3.99)) {
+    // FIXME: NormalRandomGenerator doesn't work in bfloat and half.
+    double_tensor.setRandom<Eigen::internal::NormalRandomGenerator<double>>();
+    double_tensor += double_tensor.constant(prescribed_mean);
+    tensor = double_tensor.cast<ScalarType>();
 
-    Tensor<float, 0> sum = tensor.sum();
+    Tensor<ScalarType, 0> sum;
+    sum = tensor.sum();
+
+    // Compute the reference value in double precision.
     double expected_sum = 0.0;
-    for (int i = 0; i < 101; ++i) {
-      for (int j = 0; j < 101; ++j) {
-        for (int k = 0; k < 101; ++k) {
-          expected_sum += static_cast<double>(tensor(i, j, k));
-        }
-      }
+    for (int i = 0; i < num_elements; ++i) {
+      expected_sum += static_cast<double>(tensor(i));
     }
-    VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
+    VERIFY_IS_APPROX(sum(), static_cast<ScalarType>(expected_sum));
   }
 }
 
@@ -528,5 +531,11 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
   CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
-  CALL_SUBTEST(test_sum_accuracy());
+  CALL_SUBTEST((test_sum_accuracy<float,10000000,10000>()));
+  CALL_SUBTEST((test_sum_accuracy<Eigen::bfloat16,10000,100>()));
+  // The range of half is limited to 65519 when using round-to-even,
+  // so we are severely limited in the size and mean of the tensors
+  // we can reduce without overflow.
+  CALL_SUBTEST((test_sum_accuracy<Eigen::half,4096,16>()));
+  CALL_SUBTEST((test_sum_accuracy<Eigen::half,10000,1>()));
 }
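
For context, the sketch below illustrates the technique the patch relies on, outside of Eigen: accumulating a long array of floats sequentially lets rounding error grow roughly linearly with the number of inputs, while splitting the range in half recursively (and only summing sequentially below a small leaf size, as TensorReduction.h now does per scalar type) keeps each input in O(log n) additions. This is a minimal standalone sketch, not part of the patch; the helper names (sequential_sum, tree_sum) and the constants are chosen purely for demonstration.

// Standalone sketch of tree-based (pairwise) summation; not Eigen code.
#include <cstdio>
#include <vector>

// Sequential accumulation: each input can participate in up to n-1 additions,
// so rounding error grows roughly linearly with n.
float sequential_sum(const std::vector<float>& v) {
  float acc = 0.0f;
  for (float x : v) acc += x;
  return acc;
}

// Tree-based accumulation: split the range in half recursively and only sum
// sequentially below a small leaf size, so each input participates in only
// O(log n) additions. A lower-precision type such as half or bfloat16 calls
// for a smaller leaf, which is why the patch uses 200 and 128 instead of 1024.
float tree_sum(const std::vector<float>& v, size_t first, size_t count) {
  const size_t kLeafSize = 1024;
  if (count <= kLeafSize) {
    float acc = 0.0f;
    for (size_t i = 0; i < count; ++i) acc += v[first + i];
    return acc;
  }
  const size_t half = count / 2;
  return tree_sum(v, first, half) + tree_sum(v, first + half, count - half);
}

int main() {
  // 10 million copies of 0.1f: the exact sum is close to 1e6, but the
  // sequential float sum drifts away from it once the accumulator dwarfs
  // each new term, while the tree sum stays close.
  std::vector<float> v(10000000, 0.1f);
  std::printf("sequential: %f\n", sequential_sum(v));
  std::printf("tree:       %f\n", tree_sum(v, 0, v.size()));
  return 0;
}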