Small improvement to the full reduction of fp16

2025-10-10 15:16:31 +08:00 · 2016-05-10 11:58:18 -07:00 · 2016-05-10 11:58:18 -07:00 · 0eb69b7552
commit 0eb69b7552
parent 0b9e3dcd06
1 changed files with 6 additions and 4 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -193,16 +193,18 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self
  __syncthreads();
  if (gridDim.x == 1 && first_index == 0) {
-    reducer.reduce(__low2half(*scratch), output);
+    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), output);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
  }
 }
 template <typename Op>
 __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  eigen_assert(threadIdx.x == 1);
-  reducer.reduce(__low2half(*scratch), output);
+  half tmp = __low2half(*scratch);
-  reducer.reduce(__high2half(*scratch), output);
+  reducer.reduce(__high2half(*scratch), &tmp);
+  *output = tmp;
 }
 #endif