Clean up gpu device properties.

Made a class and singleton to encapsulate initialization and retrieval of device properties. Related to !481, which already changed the API to address a static linkage issue.
2025-10-21 04:21:07 +08:00 · 2021-05-06 12:50:51 -07:00 · 2021-05-06 12:50:51 -07:00 · 0eba8a1fe3
commit 0eba8a1fe3
parent 90e9a33e1c
1 changed files with 69 additions and 47 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@ -42,58 +42,84 @@ class StreamInterface {
  virtual unsigned int* semaphore() const = 0;
 };
-EIGEN_STRONG_INLINE gpuDeviceProp_t*& getDeviceProperties() {
+class GpuDeviceProperties {
-  static gpuDeviceProp_t* deviceProperties;
+ public:
-  return deviceProperties;
+  GpuDeviceProperties() : 
-}
+      initialized_(false), first_(true), device_properties_(nullptr) {}
  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }
  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }
-EIGEN_STRONG_INLINE bool& getDevicePropInitialized() {
+  EIGEN_STRONG_INLINE bool isInitialized() const {
-  static bool devicePropInitialized = false;
+    return initialized_;
-  return devicePropInitialized;
+  }
 }
-static void initializeDeviceProp() {
+  void initialize() {
-  if (!getDevicePropInitialized()) {
+    if (!initialized_) {
-    // Attempts to ensure proper behavior in the case of multiple threads
+      // Attempts to ensure proper behavior in the case of multiple threads
-    // calling this function simultaneously. This would be trivial to
+      // calling this function simultaneously. This would be trivial to
-    // implement if we could use std::mutex, but unfortunately mutex don't
+      // implement if we could use std::mutex, but unfortunately mutex don't
-    // compile with nvcc, so we resort to atomics and thread fences instead.
+      // compile with nvcc, so we resort to atomics and thread fences instead.
-    // Note that if the caller uses a compiler that doesn't support c++11 we
+      // Note that if the caller uses a compiler that doesn't support c++11 we
-    // can't ensure that the initialization is thread safe.
+      // can't ensure that the initialization is thread safe.
-    static std::atomic<bool> first(true);
+      if (first_.exchange(false)) {
-    if (first.exchange(false)) {
+        // We're the first thread to reach this point.
-      // We're the first thread to reach this point.
+        int num_devices;
-      int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
      gpuError_t status = gpuGetDeviceCount(&num_devices);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the number of GPU devices: "
                  << gpuGetErrorString(status)
                  << std::endl;
        gpu_assert(status == gpuSuccess);
      }
      getDeviceProperties() = new gpuDeviceProp_t[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = gpuGetDeviceProperties(&getDeviceProperties()[i], i);
        if (status != gpuSuccess) {
-          std::cerr << "Failed to initialize GPU device #"
+          std::cerr << "Failed to get the number of GPU devices: "
                    << i
                    << ": "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
-      }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #"
                      << i
                      << ": "
                      << gpuGetErrorString(status)
                      << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }
-      std::atomic_thread_fence(std::memory_order_release);
+        std::atomic_thread_fence(std::memory_order_release);
-      getDevicePropInitialized() = true;
+        initialized_ = true;
-    } else {
+      } else {
-      // Wait for the other thread to inititialize the properties.
+        // Wait for the other thread to inititialize the properties.
-      while (!getDevicePropInitialized()) {
+        while (!initialized_) {
-        std::atomic_thread_fence(std::memory_order_acquire);
+          std::atomic_thread_fence(std::memory_order_acquire);
-        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }
 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
 };
 EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
 }
 EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
 }
 static const gpuStream_t default_stream = gpuStreamDefault;
@ -103,12 +129,9 @@ class GpuStreamDevice : public StreamInterface {
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device
-  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
    initializeDeviceProp();
  }
  // Use the specified stream. Note that it's the
  // caller responsibility to ensure that the stream can run on
  // the specified device. If no device is specified the code
@ -125,7 +148,6 @@ class GpuStreamDevice : public StreamInterface {
      gpu_assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }
  virtual ~GpuStreamDevice() {
@ -136,7 +158,7 @@ class GpuStreamDevice : public StreamInterface {
  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
-    return getDeviceProperties()[device_];
+    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);