Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-15 05:05:58 +08:00)
Improved the performance of full reductions on GPU.

Before:
BM_fullReduction/10        200000        11751       8.51 MFlops/s
BM_fullReduction/80          5000       523385      12.23 MFlops/s
BM_fullReduction/640           50     36179326      11.32 MFlops/s
BM_fullReduction/4K             1   2173517195      11.50 MFlops/s

After:
BM_fullReduction/10        500000         5987      16.70 MFlops/s
BM_fullReduction/80        200000        10636     601.73 MFlops/s
BM_fullReduction/640        50000        58428    7010.31 MFlops/s
BM_fullReduction/4K          1000      2006106   12461.95 MFlops/s
parent c3859a2b58
commit 4670d7d5ce
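For context, the benchmark measures a full reduction (a tensor collapsed to a single scalar) on a GpuDevice. Below is a minimal sketch of that kind of expression, assuming the unsupported Eigen Tensor module's CUDA device API of that era; the function and buffer names are illustrative and this is not the BM_fullReduction source.

// Compile with nvcc as a .cu file; EIGEN_USE_GPU enables the GpuDevice code paths.
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// gpu_in points to n*n floats and gpu_out to a single float, both in device memory.
void full_reduction_sketch(float* gpu_in, float* gpu_out, int n) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > input(gpu_in, n, n);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > result(gpu_out);

  // sum() with no dimension list is a full reduction; on a GpuDevice it is
  // dispatched through internal::FullReducer and the kernels touched in this commit.
  result.device(gpu_device) = input.sum();
}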
@@ -322,6 +322,12 @@ struct OuterReducer {
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
 
 
+template <typename S, typename R, typename I>
+__global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+
+
 template <int NPT, typename S, typename R, typename I>
 __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
@@ -618,6 +624,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
   template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
@@ -67,6 +67,30 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
 
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+#if __CUDA_ARCH__ >= 300
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
 template <typename T>
 __device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
 #if __CUDA_ARCH__ >= 300
@@ -108,7 +132,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 
 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reduce(__shfl_down(accum, offset), &accum);
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
   }
 
   if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -117,28 +141,78 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 }
 
 
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
-
-  template <typename OutputType>
-  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
-    assert(false && "Should only be called on floats");
-  }
-
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  if (num_coeffs % 2 != 0) {
+    half last = input.m_impl.coeff(num_coeffs-1);
+    *scratch = __halves2half2(last, reducer.initialize());
+  } else {
+    *scratch = reducer.template initializePacket<half2>();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                                    half* output, half2* scratch) {
+  eigen_assert(NumPerThread % 2 == 0);
+
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+  if (gridDim.x == 1 && first_index == 0) {
+    if (num_coeffs % 2 != 0) {
+      half last = input.m_impl.coeff(num_coeffs-1);
+      *scratch = __halves2half2(last, reducer.initialize());
+    } else {
+      *scratch = reducer.template initializePacket<half2>();
+    }
+  }
+
+  half2 accum = reducer.template initializePacket<half2>();
+  Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + 2*i;
+    eigen_assert(index + 1 < num_coeffs);
+    half2 val = input.m_impl.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(scratch, accum, reducer);
+  }
+
+  __syncthreads();
+
+  if (gridDim.x == 1 && first_index == 0) {
+    reducer.reduce(__low2half(*scratch), output);
+    reducer.reduce(__high2half(*scratch), output);
+  }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  reducer.reduce(__low2half(*scratch), output);
+  reducer.reduce(__high2half(*scratch), output);
+}
+
+#endif
+
+
+template <typename Self, typename Op, bool is_half>
+struct Launcher {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, typename Self::CoeffReturnType* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
@@ -155,6 +229,64 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   }
 };
 
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct Launcher<Self, Op, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    half2* scratch = static_cast<half2*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value);
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+      internal::is_same<typename Self::CoeffReturnType, float>::value;
+#endif
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    static const bool is_half = internal::is_same<typename Self::CoeffReturnType, half>::value;
+    Launcher<Self, Op, is_half>::run(self, reducer, device, output, num_coeffs);
+  }
+};
+
+
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
@@ -323,7 +455,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
       return true;
     }
 
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 16;
     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
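The new EIGEN_HAS_CUDA_FP16 path is selected when the reduced scalar type is Eigen::half. A hedged usage sketch follows, showing a half-precision full reduction that would go through Launcher<Self, Op, true> and the half-float kernels added above; the function and buffer names are assumptions for illustration only.

// Requires EIGEN_HAS_CUDA_FP16 and a device with __CUDA_ARCH__ >= 300.
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// gpu_in points to n device-side Eigen::half values, gpu_out to a single one.
void half_full_reduction_sketch(Eigen::half* gpu_in, Eigen::half* gpu_out, int n) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1> > input(gpu_in, n);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0> > result(gpu_out);

  // CoeffReturnType is Eigen::half, so FullReducer picks Launcher<Self, Op, true>,
  // which launches ReductionInitKernelHalfFloat, FullReductionKernelHalfFloat and,
  // for multi-block launches, ReductionCleanupKernelHalfFloat.
  result.device(gpu_device) = input.sum();
}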