Added the ability to use a scratch buffer in cuda kernels

Benoit Steiner 2016-05-09 17:05:53 -07:00
parent ba95e43ea2
commit c3859a2b58

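For context, the sketch below (not part of the commit) shows how the new scratchpad() accessor could be used from host code to hand a kernel a small piece of device memory. Only scratchpad(), the 1 KB size, and the CudaStreamDevice/GpuDevice constructors visible in the diff come from this change; the EIGEN_USE_GPU setup, the GpuDevice::stream() accessor, and the names fill_scratch / run_with_scratch are assumptions for illustration.

// Minimal host-side sketch, assuming the usual Eigen GPU setup; the kernel
// fill_scratch and the helper run_with_scratch are hypothetical.
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical kernel: each thread writes its index into the scratch buffer.
__global__ void fill_scratch(int* scratch, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) scratch[i] = i;
}

void run_with_scratch() {
  Eigen::CudaStreamDevice stream;    // default stream on the current device
  Eigen::GpuDevice device(&stream);

  // 1 KB of device memory, lazily allocated and owned by the stream object.
  int* scratch = static_cast<int*>(device.scratchpad());
  const int n = 1024 / sizeof(int);

  fill_scratch<<<1, n, 0, device.stream()>>>(scratch, n);

  int host[n];
  cudaMemcpyAsync(host, scratch, 1024, cudaMemcpyDeviceToHost, device.stream());
  cudaStreamSynchronize(device.stream());
  // No explicit free: ~CudaStreamDevice() releases the scratchpad.
}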

@@ -24,6 +24,9 @@ class StreamInterface {
   // Allocate memory on the actual device where the computation will run
   virtual void* allocate(size_t num_bytes) const = 0;
   virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
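The new pure virtual makes the 1 KB scratch area part of the StreamInterface contract: the buffer belongs to the stream object and callers must not free it. As an illustration only (the helper name and the cudaMemsetAsync-based body are assumptions, not part of this change), a caller written purely against the interface could reset the scratch area like this:

// Hypothetical helper, relying only on what this hunk guarantees:
// scratchpad() returns a 1024-byte device buffer owned by the stream.
static void clear_scratchpad(const Eigen::StreamInterface& si,
                             cudaStream_t stream) {
  void* scratch = si.scratchpad();            // cached; do not deallocate()
  cudaMemsetAsync(scratch, 0, 1024, stream);  // zero it on the given stream
}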
@@ -62,12 +65,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -75,7 +78,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device) {
+      : stream_(stream), device_(device), scratch_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -89,6 +92,12 @@ class CudaStreamDevice : public StreamInterface {
     initializeDeviceProp();
   }
 
+  virtual ~CudaStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
   const cudaStream_t& stream() const { return *stream_; }
   const cudaDeviceProp& deviceProperties() const {
     return m_deviceProperties[device_];
@@ -112,9 +121,17 @@ class CudaStreamDevice : public StreamInterface {
     assert(err == cudaSuccess);
   }
 
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(1024);
+    }
+    return scratch_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
+  mutable void* scratch_;
 };
 
 struct GpuDevice {
@@ -143,12 +160,20 @@ struct GpuDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
 #ifndef __CUDA_ARCH__
     stream_->deallocate(buffer);
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
+#ifndef __CUDA_ARCH__
+    return stream_->scratchpad();
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+    return NULL;
+  }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,