diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index f13ee48e9..87186e1b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -227,13 +227,19 @@ struct InnerMostDimReducer {
 // The following implements tree-based reduction, which improves the accuracy
 // of sum and mean reductions, since each of the n inputs only participates in
 // O(log n) additions.
-static const int kLeafSize = 1024;
+template <typename T>
+EIGEN_DEVICE_FUNC inline Index LeafSize() { return 1024; }
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<half>() { return 200; }
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() { return 128; }
 
 template <typename Self, typename Op>
 struct InnerMostDimReducer<Self, Op, false, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
   reduce(const Self& self, typename Self::Index firstIndex,
          typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
     typename Self::CoeffReturnType accum = reducer.initialize();
     if (numValuesToReduce > kLeafSize) {
       const typename Self::Index half = numValuesToReduce / 2;
@@ -254,6 +260,7 @@ struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
   reduce(const Self& self, typename Self::Index firstIndex,
          typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
     const typename Self::Index packetSize =
         internal::unpacket_traits<typename Self::PacketReturnType>::size;
     typename Self::CoeffReturnType accum = reducer.initialize();
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index c46c4c91d..b7611d7b0 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -486,22 +486,25 @@ static void test_reduce_middle_dims() {
   }
 }
 
-static void test_sum_accuracy() {
-  Tensor<float, 3> tensor(101, 101, 101);
-  for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
-    tensor.setRandom();
-    tensor += tensor.constant(prescribed_mean);
+template <typename ScalarType, int num_elements, int max_mean>
+void test_sum_accuracy() {
+  Tensor<double, 1> double_tensor(num_elements);
+  Tensor<ScalarType, 1> tensor(num_elements);
+  for (double prescribed_mean = 0; prescribed_mean <= max_mean; prescribed_mean = numext::maxi(1.0, prescribed_mean*3.99)) {
+    // FIXME: NormalRandomGenerator doesn't work in bfloat and half.
+    double_tensor.setRandom<Eigen::internal::NormalRandomGenerator<double>>();
+    double_tensor += double_tensor.constant(prescribed_mean);
+    tensor = double_tensor.cast<ScalarType>();
 
-    Tensor<float, 0> sum = tensor.sum();
+    Tensor<ScalarType, 0> sum;
+    sum = tensor.sum();
+
+    // Compute the reference value in double precision.
     double expected_sum = 0.0;
-    for (int i = 0; i < 101; ++i) {
-      for (int j = 0; j < 101; ++j) {
-        for (int k = 0; k < 101; ++k) {
-          expected_sum += static_cast<double>(tensor(i, j, k));
-        }
-      }
+    for (int i = 0; i < num_elements; ++i) {
+      expected_sum += static_cast<double>(tensor(i));
     }
-    VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
+    VERIFY_IS_APPROX(sum(), static_cast<ScalarType>(expected_sum));
   }
 }
 
@@ -528,5 +531,11 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
   CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
-  CALL_SUBTEST(test_sum_accuracy());
+  CALL_SUBTEST((test_sum_accuracy<float,10000000,10000>()));
+  CALL_SUBTEST((test_sum_accuracy<Eigen::bfloat16,10000,100>()));
+  // The range of half is limited to 65519 when using round-to-even,
+  // so we are severely limited in the size and mean of the tensors
+  // we can reduce without overflow.
+  CALL_SUBTEST((test_sum_accuracy<Eigen::half,4096,16>()));
+  CALL_SUBTEST((test_sum_accuracy<Eigen::half,10000,1>()));
 }
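
For context, the sketch below illustrates the technique the patch relies on, outside of Eigen: accumulating a long array of floats sequentially lets rounding error grow roughly linearly with the number of inputs, while splitting the range in half recursively (and only summing sequentially below a small leaf size, as TensorReduction.h now does per scalar type) keeps each input in O(log n) additions. This is a minimal standalone sketch, not part of the patch; the helper names (sequential_sum, tree_sum) and the constants are chosen purely for demonstration.

// Standalone sketch of tree-based (pairwise) summation; not Eigen code.
#include <cstdio>
#include <vector>

// Sequential accumulation: each input can participate in up to n-1 additions,
// so rounding error grows roughly linearly with n.
float sequential_sum(const std::vector<float>& v) {
  float acc = 0.0f;
  for (float x : v) acc += x;
  return acc;
}

// Tree-based accumulation: split the range in half recursively and only sum
// sequentially below a small leaf size, so each input participates in only
// O(log n) additions. A lower-precision type such as half or bfloat16 calls
// for a smaller leaf, which is why the patch uses 200 and 128 instead of 1024.
float tree_sum(const std::vector<float>& v, size_t first, size_t count) {
  const size_t kLeafSize = 1024;
  if (count <= kLeafSize) {
    float acc = 0.0f;
    for (size_t i = 0; i < count; ++i) acc += v[first + i];
    return acc;
  }
  const size_t half = count / 2;
  return tree_sum(v, first, half) + tree_sum(v, first + half, count - half);
}

int main() {
  // 10 million copies of 0.1f: the exact sum is close to 1e6, but the
  // sequential float sum drifts away from it once the accumulator dwarfs
  // each new term, while the tree sum stays close.
  std::vector<float> v(10000000, 0.1f);
  std::printf("sequential: %f\n", sequential_sum(v));
  std::printf("tree:       %f\n", tree_sum(v, 0, v.size()));
  return 0;
}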