Added the ability to use a scratch buffer in cuda kernels

2025-07-11 23:51:49 +08:00 · 2016-05-09 17:05:53 -07:00 · 2016-05-09 17:05:53 -07:00 · c3859a2b58
commit c3859a2b58
parent ba95e43ea2
1 changed files with 29 additions and 4 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@ -24,6 +24,9 @@ class StreamInterface {
  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of 1024 bytes (1 KiB)
+  virtual void* scratchpad() const = 0;
 };

 static cudaDeviceProp* m_deviceProperties;
@ -62,12 +65,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {  // scratch_ starts NULL; scratchpad() allocates it on first use
    cudaGetDevice(&device_);  // record the device that is current at construction time
    initializeDeviceProp();
  }
  // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {  // scratch_ starts NULL; scratchpad() allocates it on first use
    initializeDeviceProp();
  }
  // Use the specified stream. Note that it's the
@ -75,7 +78,7 @@ class CudaStreamDevice : public StreamInterface {
  // the specified device. If no device is specified the code
  // assumes that the stream is associated to the current gpu device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device) {
+      : stream_(stream), device_(device), scratch_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
@ -89,6 +92,12 @@ class CudaStreamDevice : public StreamInterface {
    initializeDeviceProp();
  }

+  virtual ~CudaStreamDevice() {  // Frees the scratchpad buffer if scratchpad() ever allocated one
+    if (scratch_) {  // scratch_ is still NULL when scratchpad() was never called
+      deallocate(scratch_);
+    }
+  }
+
  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
@ -112,9 +121,17 @@ class CudaStreamDevice : public StreamInterface {
    assert(err == cudaSuccess);
  }

+  virtual void* scratchpad() const {  // Returns the 1024-byte scratch buffer, allocating it on first use
+    if (scratch_ == NULL) {  // NOTE(review): check-then-allocate has no locking here — confirm callers never race on the first call
+      scratch_ = allocate(1024);  // scratch_ is declared mutable, so it can be assigned inside this const method
+    }
+    return scratch_;  // the buffer is released by ~CudaStreamDevice(), not by the caller
+  }
+
 private:
  const cudaStream_t* stream_;
  int device_;
+  mutable void* scratch_;
 };

 struct GpuDevice {
@ -143,12 +160,20 @@ struct GpuDevice {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {  // Forwards to stream_->deallocate() when __CUDA_ARCH__ is not defined
 #ifndef __CUDA_ARCH__
    stream_->deallocate(buffer);
-
 #else  // __CUDA_ARCH__ defined: this device object is not meant to be used when generating kernel code
    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
  }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {  // Forwards to stream_->scratchpad() when __CUDA_ARCH__ is not defined
+#ifndef __CUDA_ARCH__
+    return stream_->scratchpad();
+#else  // __CUDA_ARCH__ defined: this device object is not meant to be used when generating kernel code
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+    return NULL;  // only reachable in the #else branch; the #ifndef branch has already returned
+  }
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,