From 4670d7d5ce2517b2e9201f1cf44ae62ef2284eb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 May 2016 17:09:54 -0700 Subject: [PATCH] Improved the performance of full reductions on GPU: Before: BM_fullReduction/10 200000 11751 8.51 MFlops/s BM_fullReduction/80 5000 523385 12.23 MFlops/s BM_fullReduction/640 50 36179326 11.32 MFlops/s BM_fullReduction/4K 1 2173517195 11.50 MFlops/s After: BM_fullReduction/10 500000 5987 16.70 MFlops/s BM_fullReduction/80 200000 10636 601.73 MFlops/s BM_fullReduction/640 50000 58428 7010.31 MFlops/s BM_fullReduction/4K 1000 2006106 12461.95 MFlops/s --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 8 + .../CXX11/src/Tensor/TensorReductionCuda.h | 172 ++++++++++++++++-- 2 files changed, 160 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 885295f0a..97f4b34b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -322,6 +322,12 @@ struct OuterReducer { template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + +template +__global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*); +template +__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); @@ -618,6 +624,8 @@ struct TensorEvaluator, Device> #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*); + template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index fd2587dd5..9186dffe4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,6 +67,30 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } + +template