Mirror of https://gitlab.com/libeigen/eigen.git
Improved the performance of full reductions.
AFTER:
BM_fullReduction/10       4541     4543   154017    21.0M items/s
BM_fullReduction/64       5191     5193   100000   752.5M items/s
BM_fullReduction/512      9588     9588    71361    25.5G items/s
BM_fullReduction/4k     244314   244281     2863    64.0G items/s
BM_fullReduction/5k     359382   359363     1946    64.8G items/s

BEFORE:
BM_fullReduction/10       9085     9087    74395    10.5M items/s
BM_fullReduction/64       9478     9478    72014   412.1M items/s
BM_fullReduction/512     14643    14646    46902    16.7G items/s
BM_fullReduction/4k     260338   260384     2678    60.0G items/s
BM_fullReduction/5k     385076   385178     1818    60.5G items/s
This commit is contained in:
parent 8d97ba6b22
commit c2a102345f
@@ -12,6 +12,8 @@
 
 namespace Eigen {
 
+static const int kCudaScratchSize = 1024;
+
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@@ -27,6 +29,12 @@ class StreamInterface {
 
   // Return a scratchpad buffer of size 1k
   virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -65,12 +73,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -78,7 +86,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL) {
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -123,15 +131,27 @@ class CudaStreamDevice : public StreamInterface {
 
   virtual void* scratchpad() const {
     if (scratch_ == NULL) {
-      scratch_ = allocate(1024);
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
     }
     return scratch_;
   }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
   mutable void* scratch_;
+  mutable unsigned int* semaphore_;
 };
 
 struct GpuDevice {
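Together with the new kCudaScratchSize constant, the buffer returned by scratchpad() now consists of kCudaScratchSize bytes of general-purpose scratch space followed by one trailing unsigned int that serves as the semaphore, which is zeroed asynchronously on the device's stream the first time it is requested, so the semaphore costs no extra device allocation. A minimal host-side sketch of that layout, using plain CUDA runtime calls and illustrative names rather than the Eigen classes:

#include <cassert>
#include <cuda_runtime.h>

static const int kScratchSize = 1024;  // stands in for kCudaScratchSize

int main() {
  // A single allocation holds both the scratch area and the trailing semaphore.
  void* scratch = NULL;
  cudaError_t err = cudaMalloc(&scratch, kScratchSize + sizeof(unsigned int));
  assert(err == cudaSuccess);

  // The semaphore occupies the last sizeof(unsigned int) bytes of the buffer,
  // mirroring how CudaStreamDevice::semaphore() derives it from scratchpad().
  unsigned int* semaphore =
      reinterpret_cast<unsigned int*>(static_cast<char*>(scratch) + kScratchSize);

  // It must start at 0; kernels that use it are expected to leave it at 0 again.
  err = cudaMemset(semaphore, 0, sizeof(unsigned int));
  assert(err == cudaSuccess);

  err = cudaFree(scratch);
  assert(err == cudaSuccess);
  return 0;
}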
@@ -174,6 +194,15 @@ struct GpuDevice {
 #endif
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+#ifndef __CUDA_ARCH__
+    return stream_->semaphore();
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return NULL;
+#endif
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
@@ -316,7 +316,7 @@ struct OuterReducer {
 
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 
 
 #ifdef EIGEN_HAS_CUDA_FP16
@@ -616,7 +616,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 #ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
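These two hunks are declaration-only changes: FullReductionKernel now takes an extra unsigned int* for the semaphore, so the forward declaration of the kernel and the matching friend declaration inside the reduction evaluator have to be updated to the new signature, while the half-float kernels keep their existing parameters.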
@@ -112,17 +112,40 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
   }
 }
 
+
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+  // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    *output = reducer.initialize();
-    __syncthreads();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
   }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExch(output, reducer.initialize());
+        unsigned int old = atomicExch(semaphore, 2u);
+        assert(old == 1u);
+      }
+      else {
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val = atomicCAS(semaphore, 2u, 2u);
+        while (val < 2u) {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
 
   typename Self::CoeffReturnType accum = reducer.initialize();
   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
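Read as a small state machine, the semaphore holds 0 while the output is uninitialized, 1 while the block that won the atomicCAS(semaphore, 0u, 1u) race is writing reducer.initialize() into the output, and 2 once that write has been published with atomicExch. Every other block busy-waits with atomicCAS(semaphore, 2u, 2u), which acts as a read that cannot be served from a stale cache, until it observes a value of at least 2. This handshake is what allows the launcher further down to drop the separate ReductionInitKernel launch that previously initialized the output.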
@@ -141,6 +164,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(output, accum, reducer);
   }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
+    assert(ticket >= 2u);
+    if (ticket == gridDim.x + 1) {
+      // We're the last block, reset the semaphore
+      *semaphore = 0;
+    }
+  }
 }
 
 
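The completion counter works because the semaphore equals 2 once the output is initialized and each of the gridDim.x blocks performs exactly one atomicInc, which returns the pre-increment value, when it finishes: the first block to finish reads back 2, the second 3, and the last reads 2 + (gridDim.x - 1) = gridDim.x + 1, so the ticket == gridDim.x + 1 test singles out the final block, which resets the semaphore to 0 and restores the invariant documented on StreamInterface::semaphore(). With gridDim.x = 4, for instance, the tickets handed out are 2, 3, 4 and 5, and the block that draws 5 performs the reset. The following self-contained sketch wires the same protocol around a plain atomicAdd-based float sum; the kernel, names and launch configuration are illustrative and not the Eigen implementation:

#include <climits>
#include <cstdio>
#include <cuda_runtime.h>

// Sums n floats into *out. The semaphore starts at 0 and is returned to 0 by
// the last block to finish, mimicking the protocol used by FullReductionKernel
// instead of launching a separate initialization kernel.
__global__ void SumKernel(const float* in, int n, float* out,
                          unsigned int* semaphore) {
  if (threadIdx.x == 0) {
    unsigned int block = atomicCAS(semaphore, 0u, 1u);
    if (block == 0) {
      // This block won the race: initialize the output, then publish state 2.
      *out = 0.0f;
      __threadfence();
      atomicExch(semaphore, 2u);
    } else {
      // Another block is (or was) initializing: spin until the state is >= 2.
      // atomicCAS is used as an uncached read, as in the Eigen kernel.
      while (atomicCAS(semaphore, 2u, 2u) < 2u) {
      }
    }
  }
  __syncthreads();  // the whole block waits for the initialization handshake

  // Grid-stride partial sum, folded into the shared output with atomicAdd.
  float acc = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    acc += in[i];
  }
  atomicAdd(out, acc);

  __syncthreads();  // make sure every thread of this block has contributed
  if (threadIdx.x == 0) {
    // Tickets come back as 2, 3, ..., gridDim.x + 1; the block that draws
    // gridDim.x + 1 is the last one and restores the semaphore to 0.
    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
    if (ticket == gridDim.x + 1) {
      *semaphore = 0;
    }
  }
}

int main() {
  const int n = 1 << 20;
  float *d_in = NULL, *d_out = NULL;
  unsigned int* d_semaphore = NULL;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMalloc(&d_semaphore, sizeof(unsigned int));
  cudaMemset(d_semaphore, 0, sizeof(unsigned int));  // invariant: starts at 0

  // Fill the input with ones so the expected sum is exactly n.
  float* h_in = new float[n];
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  SumKernel<<<64, 256>>>(d_in, n, d_out, d_semaphore);

  float result = 0.0f;
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f (expected %d)\n", result, n);

  delete[] h_in;
  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_semaphore);
  return 0;
}

Here the __threadfence() before publishing state 2 plays the role that the atomicExch on the output plays in the patch: other blocks cannot observe the semaphore at 2 before the initialized output value is visible to them.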
@@ -246,15 +278,13 @@ struct FullReductionLauncher<Self, Op, float, PacketAccess> {
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
 
+    unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>),
-                         1, 32, 0, device, reducer.initialize(), 1, output);
+      semaphore = device.semaphore();
     }
 
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
   }
 };
 
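For context, this launcher is reached whenever a float tensor is fully reduced on a GpuDevice through the Tensor API. A minimal sketch of how that is typically driven, assuming the unsupported CXX11 Tensor module, nvcc and EIGEN_USE_GPU; the buffer size and the use of the default stream are illustrative:

#define EIGEN_USE_GPU
#include <cstdio>
#include <cuda_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  const int n = 1 << 20;

  // Device-side input and rank-0 (scalar) output buffers.
  float* d_in = NULL;
  float* d_out = NULL;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));

  Eigen::CudaStreamDevice stream;        // wraps the default CUDA stream
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1> > in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);

  in.device(gpu_device) = in.constant(1.0f);  // fill the input on the GPU
  out.device(gpu_device) = in.sum();          // full reduction: FullReductionKernel path
  gpu_device.synchronize();

  float result = 0.0f;
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f (expected %d)\n", result, n);

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}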