Added support for multi gpu configuration to the GpuDevice class

2025-09-27 00:33:24 +08:00 · 2015-07-15 12:38:34 -07:00 · 2015-07-15 12:38:34 -07:00 · e892524efe
commit e892524efe
parent f5aa640862
3 changed files with 199 additions and 58 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@ -835,10 +835,10 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

-    const int maxSharedMem = sharedMemPerBlock();
-    const int maxThreadsPerBlock = maxCudaThreadsPerBlock();
-    const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
-    const int numMultiProcessors = getNumCudaMultiProcessors();
+    const int maxSharedMem = m_device.sharedMemPerBlock();
+    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@ -15,16 +15,22 @@ namespace Eigen {

 // Default device for the machine (typically a single cpu core)
 struct DefaultDevice {
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return internal::aligned_malloc(num_bytes);
  }
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    internal::aligned_free(buffer);
  }
-  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    ::memcpy(dst, src, n);
  }
-  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    ::memset(buffer, c, n);
  }

@ -208,6 +214,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {

 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
@ -221,6 +228,12 @@ struct ThreadPoolDevice {
  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    ::memcpy(dst, src, n);
  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }

  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    ::memset(buffer, c, n);
@ -259,81 +272,164 @@ struct ThreadPoolDevice {

 // GPU offloading
 #ifdef EIGEN_USE_GPU
-static cudaDeviceProp m_deviceProperties;
+
+// This defines an interface that GPUDevice can take to use
+// CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const cudaStream_t& stream() const = 0;
+  virtual const cudaDeviceProp& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
+
+static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;

 static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
-    assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess);
-    m_devicePropInitialized = true;
+    if (!m_devicePropInitialized) {
+      int num_devices;
+      cudaError_t status = cudaGetDeviceCount(&num_devices);
+      eigen_check(status == cudaSuccess);
+      m_deviceProperties = new cudaDeviceProp[num_devices];
+      for (int i = 0; i < num_devices; ++i) {
+        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
+        eigen_check(status == cudaSuccess);
+      }
+      m_devicePropInitialized = true;
+    }
  }
 }

-static inline int getNumCudaMultiProcessors() {
-  initializeDeviceProp();
-  return m_deviceProperties.multiProcessorCount;
-}
-static inline int maxCudaThreadsPerBlock() {
-  initializeDeviceProp();
-  return m_deviceProperties.maxThreadsPerBlock;
-}
-static inline int maxCudaThreadsPerMultiProcessor() {
-  initializeDeviceProp();
-  return m_deviceProperties.maxThreadsPerMultiProcessor;
-}
-static inline int sharedMemPerBlock() {
-  initializeDeviceProp();
-  return m_deviceProperties.sharedMemPerBlock;
-}
-
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
-  assert(status == cudaSuccess);
-}
-
-// Cuda stream to use when no stream is specified explicitely.
 static const cudaStream_t default_stream = cudaStreamDefault;

-struct GpuDevice {
-  // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
-  GpuDevice(const cudaStream_t* stream = &default_stream) : stream_(stream) { eigen_assert(stream); }
+class CudaStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  CudaStreamDevice() : stream_(&default_stream) {
+    cudaGetDevice(&device_);
+    initializeDeviceProp();
+  }
+  // Use the default stream on the specified device
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+    initializeDeviceProp();
+  }
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
+      : stream_(stream), device_(device) {
+    if (device < 0) {
+      cudaGetDevice(&device_);
+    } else {
+      int num_devices;
+      cudaError_t err = cudaGetDeviceCount(&num_devices);
+      eigen_check(err == cudaSuccess);
+      eigen_check(device < num_devices);
+      device_ = device;
+    }
+    initializeDeviceProp();
+  }

-  EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
+  const cudaStream_t& stream() const { return *stream_; }
+  const cudaDeviceProp& deviceProperties() const {
+    return m_deviceProperties[device_];
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    cudaError_t err = cudaSetDevice(device_);
+    eigen_check(err == cudaSuccess);
+    void* result;
+    err = cudaMalloc(&result, num_bytes);
+    eigen_check(err == cudaSuccess);
+    eigen_check(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    cudaError_t err = cudaSetDevice(device_);
+    eigen_check(err == cudaSuccess);
+    assert(buffer != NULL);
+    err = cudaFree(buffer);
+    assert(err == cudaSuccess);
+  }
+
+ private:
+  const cudaStream_t* stream_;
+  int device_;
+};
+
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+    eigen_assert(stream);
+  }
+
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
+    return stream_->stream();
+  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
 #ifndef __CUDA_ARCH__
-    void* result;
-    assert(cudaMalloc(&result, num_bytes) == cudaSuccess);
-    assert(result != NULL);
-    return result;
+    return stream_->allocate(num_bytes);
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
 #endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
 #ifndef __CUDA_ARCH__
-    assert(buffer != NULL);
-    assert(cudaFree(buffer) == cudaSuccess);
+    stream_->deallocate(buffer);
+
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
-    assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess);
+    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
+                                      stream_->stream());
+    assert(err == cudaSuccess);
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
+    assert(err == cudaSuccess);
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
+    assert(err == cudaSuccess);
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
 #ifndef __CUDA_ARCH__
-    assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess);
+    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
+    assert(err == cudaSuccess);
 #else
-    assert(false && "The default device should be used instead to generate kernel code");
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
  }

@ -342,21 +438,66 @@ struct GpuDevice {
    return 32;
  }

- inline int majorDeviceVersion() const { return m_deviceProperties.major; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on cuda devices.
+    return firstLevelCacheSize();
+  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-    cudaStreamSynchronize(*stream_);
+#ifndef __CUDA_ARCH__
+    cudaError_t err = cudaStreamSynchronize(stream_->stream());
+    assert(err == cudaSuccess);
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  inline int getNumCudaMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  inline int maxCudaThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  inline int maxCudaThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  inline int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  inline int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+
+  // This function checks if the CUDA runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+    cudaError_t error = cudaStreamQuery(stream_->stream());
+    return (error == cudaSuccess) || (error == cudaErrorNotReady);
  }

 private:
-  // TODO: multigpu.
-  const cudaStream_t* stream_;
+  const StreamInterface* stream_;
+
 };

+
 #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
  assert(cudaGetLastError() == cudaSuccess);

+
+// FIXME: Should be device and kernel specific.
+static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+  eigen_check(status == cudaSuccess);
+}
+
 #endif

 }  // end namespace Eigen
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@ -205,8 +205,8 @@ class TensorExecutor<Expression, GpuDevice, false>
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
-      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-      const int block_size = maxCudaThreadsPerBlock();
+      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+      const int block_size = device.maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
      LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    }
@ -225,8 +225,8 @@ class TensorExecutor<Expression, GpuDevice, true>
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
-      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-      const int block_size = maxCudaThreadsPerBlock();
+      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+      const int block_size = device.maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
      LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    }