Clean up GPU device properties.

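Added a GpuDeviceProperties class, exposed through the singleton accessor GetGpuDeviceProperties(), to encapsulate initialization and retrieval of GPU device properties. The properties are now initialized lazily on first access instead of through explicit initializeDeviceProp() calls in the GpuStreamDevice constructors. Related to !481, which already changed the API to address a static linkage issue.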
2025-07-13 00:21:49 +08:00 · 2021-05-06 12:50:51 -07:00 · 2021-05-06 12:50:51 -07:00 · 0eba8a1fe3
commit 0eba8a1fe3
parent 90e9a33e1c
1 changed files with 69 additions and 47 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -42,58 +42,84 @@ class StreamInterface {
  virtual unsigned int* semaphore() const = 0;
 };

-EIGEN_STRONG_INLINE gpuDeviceProp_t*& getDeviceProperties() {
-  static gpuDeviceProp_t* deviceProperties;
-  return deviceProperties;
-}
+class GpuDeviceProperties {
+ public:
+  GpuDeviceProperties() : 
+      initialized_(false), first_(true), device_properties_(nullptr) {}
+ 
+  ~GpuDeviceProperties() {
+    if (device_properties_) {
+      delete[] device_properties_;
+    }
+  }
+  
+  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
+    return device_properties_[device];
+  }

-EIGEN_STRONG_INLINE bool& getDevicePropInitialized() {
-  static bool devicePropInitialized = false;
-  return devicePropInitialized;
-}
+  EIGEN_STRONG_INLINE bool isInitialized() const {
+    return initialized_;
+  }

-static void initializeDeviceProp() {
-  if (!getDevicePropInitialized()) {
-    // Attempts to ensure proper behavior in the case of multiple threads
-    // calling this function simultaneously. This would be trivial to
-    // implement if we could use std::mutex, but unfortunately mutex don't
-    // compile with nvcc, so we resort to atomics and thread fences instead.
-    // Note that if the caller uses a compiler that doesn't support c++11 we
-    // can't ensure that the initialization is thread safe.
-    static std::atomic<bool> first(true);
-    if (first.exchange(false)) {
-      // We're the first thread to reach this point.
-      int num_devices;
-      gpuError_t status = gpuGetDeviceCount(&num_devices);
-      if (status != gpuSuccess) {
-        std::cerr << "Failed to get the number of GPU devices: "
-                  << gpuGetErrorString(status)
-                  << std::endl;
-        gpu_assert(status == gpuSuccess);
-      }
-      getDeviceProperties() = new gpuDeviceProp_t[num_devices];
-      for (int i = 0; i < num_devices; ++i) {
-        status = gpuGetDeviceProperties(&getDeviceProperties()[i], i);
+  void initialize() {
+    if (!initialized_) {
+      // Attempts to ensure proper behavior in the case of multiple threads
+      // calling this function simultaneously. This would be trivial to
+      // implement if we could use std::mutex, but unfortunately std::mutex does
+      // not compile with nvcc, so we resort to atomics and thread fences instead.
+      // Note that if the caller uses a compiler that doesn't support c++11 we
+      // can't ensure that the initialization is thread safe.
+      if (first_.exchange(false)) {
+        // We're the first thread to reach this point.
+        int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
-          std::cerr << "Failed to initialize GPU device #"
-                    << i
-                    << ": "
+          std::cerr << "Failed to get the number of GPU devices: "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
-      }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
+        for (int i = 0; i < num_devices; ++i) {
+          status = gpuGetDeviceProperties(&device_properties_[i], i);
+          if (status != gpuSuccess) {
+            std::cerr << "Failed to initialize GPU device #"
+                      << i
+                      << ": "
+                      << gpuGetErrorString(status)
+                      << std::endl;
+            gpu_assert(status == gpuSuccess);
+          }
+        }

-      std::atomic_thread_fence(std::memory_order_release);
-      getDevicePropInitialized() = true;
-    } else {
-      // Wait for the other thread to inititialize the properties.
-      while (!getDevicePropInitialized()) {
-        std::atomic_thread_fence(std::memory_order_acquire);
-        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        std::atomic_thread_fence(std::memory_order_release);
+        initialized_ = true;
+      } else {
+        // Wait for the other thread to initialize the properties.
+        while (!initialized_) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        }
      }
    }
  }
+
+ private:
+  volatile bool initialized_;
+  std::atomic<bool> first_;
+  gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+  if (!deviceProperties->isInitialized()) {
+    deviceProperties->initialize();
+  }
+  return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+  return GetGpuDeviceProperties().get(device);
 }

 static const gpuStream_t default_stream = gpuStreamDefault;
@@ -103,12 +129,9 @@ class GpuStreamDevice : public StreamInterface {
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuGetDevice(&device_);
-    initializeDeviceProp();
  }
  // Use the default stream on the specified device
-  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    initializeDeviceProp();
-  }
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the
  // caller responsibility to ensure that the stream can run on
  // the specified device. If no device is specified the code
@@ -125,7 +148,6 @@ class GpuStreamDevice : public StreamInterface {
      gpu_assert(device < num_devices);
      device_ = device;
    }
-    initializeDeviceProp();
  }

  virtual ~GpuStreamDevice() {
@@ -136,7 +158,7 @@ class GpuStreamDevice : public StreamInterface {

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
-    return getDeviceProperties()[device_];
+    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);