Small improvement to the full reduction of fp16

Benoit Steiner 2016-05-10 11:58:18 -07:00
parent 0b9e3dcd06
commit 0eb69b7552


@@ -193,16 +193,18 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self
   __syncthreads();
   if (gridDim.x == 1 && first_index == 0) {
-    reducer.reduce(__low2half(*scratch), output);
-    reducer.reduce(__high2half(*scratch), output);
+    half tmp = __low2half(*scratch);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
   }
 }
 template <typename Op>
 __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
   eigen_assert(threadIdx.x == 1);
-  reducer.reduce(__low2half(*scratch), output);
-  reducer.reduce(__high2half(*scratch), output);
+  half tmp = __low2half(*scratch);
+  reducer.reduce(__high2half(*scratch), &tmp);
+  *output = tmp;
 }
 #endif
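
The change folds both 16-bit lanes of the half2 scratch word into a local half temporary and writes the final value to *output once, instead of calling reducer.reduce() twice against the output pointer in global memory. Below is a minimal, self-contained sketch of the same pattern outside of Eigen; SumReducer, InitScratch, and CleanupKernelSketch are hypothetical names introduced here for illustration only, not Eigen's actual types or kernels.

// Sketch of the pattern adopted in this commit: combine the two lanes of a
// half2 scratch value in a register, then perform a single store to the
// output in global memory. Requires native fp16 arithmetic, e.g. nvcc -arch=sm_60.
#include <cuda_fp16.h>
#include <cstdio>

struct SumReducer {  // hypothetical stand-in for Eigen's internal reducers
  __device__ void reduce(half x, half* accum) const { *accum = __hadd(*accum, x); }
};

__global__ void InitScratch(half2* scratch) {
  *scratch = __floats2half2_rn(1.5f, 2.5f);
}

__global__ void CleanupKernelSketch(SumReducer reducer, half* output, half2* scratch) {
  half tmp = __low2half(*scratch);              // start from the low lane
  reducer.reduce(__high2half(*scratch), &tmp);  // fold in the high lane
  *output = tmp;                                // single write to global memory
}

int main() {
  half2* scratch;
  half* output;
  cudaMallocManaged(&scratch, sizeof(half2));
  cudaMallocManaged(&output, sizeof(half));

  InitScratch<<<1, 1>>>(scratch);
  CleanupKernelSketch<<<1, 1>>>(SumReducer(), output, scratch);
  cudaDeviceSynchronize();

  printf("reduced value: %f\n", __half2float(*output));  // expect 4.0

  cudaFree(scratch);
  cudaFree(output);
  return 0;
}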