From 08348b4e487547ea4fa035208eb09a955cca05fd Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 11 May 2016 10:08:51 -0700
Subject: [PATCH] Fix potential race condition in the CUDA reduction code.

Insert a __syncthreads() after the in-kernel initialization of the output
(or scratch) buffer in FullReductionKernel, FullReductionKernelHalfFloat,
InnerReductionKernel and OuterReductionKernel, so that the block passes a
barrier between those initializing stores and the reduction code that
follows.

---
NOTE(review): in FullReductionKernel the new barrier sits inside
`if (gridDim.x == 1 && first_index == 0)`. If first_index differs between
the threads of a block (its definition is outside these hunks), only some
threads would reach __syncthreads(), which must be reached by every thread
of the block. Please confirm, and if so hoist the barrier out of the
thread-dependent part of the condition.

 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 6db9e63c8..bbac88192 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -121,6 +121,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   // Initialize the output value if it wasn't initialized by the ReductionInitKernel
   if (gridDim.x == 1 && first_index == 0) {
     *output = reducer.initialize();
+    __syncthreads();
   }
 
   typename Self::CoeffReturnType accum = reducer.initialize();
@@ -172,6 +173,7 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self
     } else {
       *scratch = reducer.template initializePacket<half2>();
     }
+    __syncthreads();
   }
 
   half2 accum = reducer.template initializePacket<half2>();
@@ -316,6 +318,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
       output[i] = reducer.initialize();
     }
+    __syncthreads();
   }
 
   for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
@@ -420,6 +423,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
       output[i] = reducer.initialize();
     }
+    __syncthreads();
   }
 
   // Do the reduction.