Added the ability to use a scratch buffer in cuda kernels

This commit is contained in:
Benoit Steiner 2016-05-09 17:05:53 -07:00
parent ba95e43ea2
commit c3859a2b58

View File

@ -24,6 +24,9 @@ class StreamInterface {
// Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
// Return a scratchpad buffer of size 1k
virtual void* scratchpad() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
@ -62,12 +65,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
CudaStreamDevice() : stream_(&default_stream) {
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
@ -75,7 +78,7 @@ class CudaStreamDevice : public StreamInterface {
// the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
: stream_(stream), device_(device) {
: stream_(stream), device_(device), scratch_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
@ -89,6 +92,12 @@ class CudaStreamDevice : public StreamInterface {
initializeDeviceProp();
}
virtual ~CudaStreamDevice() {
if (scratch_) {
deallocate(scratch_);
}
}
const cudaStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const {
return m_deviceProperties[device_];
@ -112,9 +121,17 @@ class CudaStreamDevice : public StreamInterface {
assert(err == cudaSuccess);
}
virtual void* scratchpad() const {
if (scratch_ == NULL) {
scratch_ = allocate(1024);
}
return scratch_;
}
private:
const cudaStream_t* stream_;
int device_;
mutable void* scratch_;
};
struct GpuDevice {
@ -143,12 +160,20 @@ struct GpuDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
stream_->deallocate(buffer);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
#ifndef __CUDA_ARCH__
return stream_->scratchpad();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
return NULL;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,