From 7ef9f47b5874c33d15649a3312d463ecbd290365 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Jun 2016 14:09:46 -0700 Subject: [PATCH] Misc small improvements to the reduction code. --- .../CXX11/src/Tensor/TensorReductionCuda.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 0d1a098b7..e82530955 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -130,15 +130,17 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num if (block == 0) { // We're the first block to run, initialize the output value atomicExch(output, reducer.initialize()); - unsigned int old = atomicExch(semaphore, 2u); - assert(old == 1u); + __threadfence(); + atomicExch(semaphore, 2u); } else { + // Wait for the first block to initialize the output value. // Use atomicCAS here to ensure that the reads aren't cached - unsigned int val = atomicCAS(semaphore, 2u, 2u); - while (val < 2u) { + unsigned int val; + do { val = atomicCAS(semaphore, 2u, 2u); } + while (val < 2u); } } } @@ -166,12 +168,8 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num } if (gridDim.x > 1 && threadIdx.x == 0) { - unsigned int ticket = atomicInc(semaphore, UINT_MAX); - assert(ticket >= 2u); - if (ticket == gridDim.x + 1) { - // We're the last block, reset the semaphore - *semaphore = 0; - } + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); } }