From 0eb69b7552efe0194c2c96ce8dee09176c49c231 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 10 May 2016 11:58:18 -0700
Subject: [PATCH] Small improvement to the full reduction of fp16

---
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index b18200166..afa1a2697 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -193,16 +193,18 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self
   __syncthreads();
 
   if (gridDim.x == 1 && first_index == 0) {
-    reducer.reduce(__low2half(*scratch), output);
-    reducer.reduce(__high2half(*scratch), output);
+    half tmp = __low2half(*scratch);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
   }
 }
 
 template <typename Op>
 __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
   eigen_assert(threadIdx.x == 1);
-  reducer.reduce(__low2half(*scratch), output);
-  reducer.reduce(__high2half(*scratch), output);
+  half tmp = __low2half(*scratch);
+  reducer.reduce(__high2half(*scratch), &tmp);
+  *output = tmp;
 }
 
 #endif
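
Note on the pattern: the old code called reduce() twice with the global `output`
pointer as the accumulator, i.e. two read-modify-write round trips to global
memory; the patched code seeds a register-local `tmp` from the low half of the
packed half2 scratch value, folds in the high half, and stores to `output`
exactly once. Below is a minimal standalone CUDA sketch of that same pattern,
under stated assumptions: `SumReducer`, `CollapseHalf2`, the single-block launch,
and the managed-memory setup are illustrative names and choices, not Eigen's
actual types or launch path, and the reducer sums via float conversion so the
sketch does not require sm_53 half arithmetic.

    #include <cstdio>
    #include <cuda_fp16.h>

    // Hypothetical stand-in for Eigen's reducer concept: folds x into *accum.
    // Sums via float conversion so no half-precision arithmetic intrinsics
    // (which need compute capability >= 5.3) are required.
    struct SumReducer {
      __device__ void reduce(half x, half* accum) const {
        *accum = __float2half(__half2float(*accum) + __half2float(x));
      }
    };

    // Mirrors the patched epilogue: collapse the packed half2 partial result
    // into a single half, accumulating in a register-local temporary so that
    // global memory is written once instead of being reduced into twice.
    __global__ void CollapseHalf2(SumReducer reducer, const half2* scratch,
                                  half* output) {
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        half tmp = __low2half(*scratch);              // seed with the low half
        reducer.reduce(__high2half(*scratch), &tmp);  // fold in the high half
        *output = tmp;                                // single global store
      }
    }

    int main() {
      half2* scratch;
      half* output;
      cudaMallocManaged(&scratch, sizeof(half2));
      cudaMallocManaged(&output, sizeof(half));
      scratch->x = __float2half(1.5f);  // packed partial sums (1.5, 2.5)
      scratch->y = __float2half(2.5f);

      CollapseHalf2<<<1, 32>>>(SumReducer(), scratch, output);
      cudaDeviceSynchronize();

      printf("reduced: %f\n", __half2float(*output));  // expect 4.0
      cudaFree(scratch);
      cudaFree(output);
      return 0;
    }

Besides saving a global round trip, keeping the accumulator in a register also
means partially-reduced state is never visible in `output` mid-kernel, which is
why the single final store is the safer shape for this epilogue.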