Split TensorDeviceType.h in 3 files to make it more manageable

2025-09-25 15:53:19 +08:00 · 2015-11-20 17:42:50 -08:00 · 2015-11-20 17:42:50 -08:00 · 9fa65d3838
commit 9fa65d3838
parent a367804856
4 changed files with 290 additions and 265 deletions
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@ -60,7 +60,9 @@
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
-#include "src/Tensor/TensorDeviceType.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceCuda.h"
 #include "src/Tensor/TensorIndexList.h"
 #include "src/Tensor/TensorDimensionList.h"
 #include "src/Tensor/TensorDimensions.h"
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@ -7,272 +7,12 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H


 namespace Eigen {

-// Default device for the machine (typically a single cpu core)
-struct DefaultDevice {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return internal::aligned_malloc(num_bytes);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    internal::aligned_free(buffer);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-    memcpy(dst, src, n);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-    memcpy(dst, src, n);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-    ::memset(buffer, c, n);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef __CUDA_ARCH__
-    // Running on the host CPU
-    return 1;
-#else
-    // Running on a CUDA device
-    return 32;
-#endif
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
-    // Running single threaded on the host CPU
-    // Should return an enum that encodes the ISA supported by the CPU
-    return 1;
-#else
-    // Running on a CUDA device
-    return __CUDA_ARCH__ / 100;
-#endif
-  }
-};
-
-
-// Multiple cpu cores
-// We should really use a thread pool here but first we need to find a portable thread pool library.
-#ifdef EIGEN_USE_THREADS
-
-// This defines an interface that ThreadPoolDevice can take to use
-// custom thread pools underneath.
-class ThreadPoolInterface {
- public:
-  virtual void Schedule(std::function<void()> fn) = 0;
-
-  virtual ~ThreadPoolInterface() {}
-};
-
-// The implementation of the ThreadPool type ensures that the Schedule method
-// runs the functions it is provided in FIFO order when the scheduling is done
-// by a single thread.
-class ThreadPool : public ThreadPoolInterface {
- public:
-  // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) {
-    for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
-    }
-  }
-
-  // Wait until all scheduled work has finished and then destroy the
-  // set of threads.
-  ~ThreadPool()
-  {
-    {
-      // Wait for all work to get done.
-      std::unique_lock<std::mutex> l(mu_);
-      empty_.wait(l, [this]() { return pending_.empty(); });
-      exiting_ = true;
-
-      // Wakeup all waiters.
-      for (auto w : waiters_) {
-        w->ready = true;
-        w->work = nullptr;
-        w->cv.notify_one();
-      }
-    }
-
-    // Wait for threads to finish.
-    for (auto t : threads_) {
-      t->join();
-      delete t;
-    }
-  }
-
-  // Schedule fn() for execution in the pool of threads. The functions are
-  // executed in the order in which they are scheduled.
-  void Schedule(std::function<void()> fn) {
-    std::unique_lock<std::mutex> l(mu_);
-    if (waiters_.empty()) {
-      pending_.push_back(fn);
-    } else {
-      Waiter* w = waiters_.back();
-      waiters_.pop_back();
-      w->ready = true;
-      w->work = fn;
-      w->cv.notify_one();
-    }
-  }
-
- protected:
-  void WorkerLoop() {
-    std::unique_lock<std::mutex> l(mu_);
-    Waiter w;
-    while (!exiting_) {
-      std::function<void()> fn;
-      if (pending_.empty()) {
-        // Wait for work to be assigned to me
-        w.ready = false;
-        waiters_.push_back(&w);
-        w.cv.wait(l, [&w]() { return w.ready; });
-        fn = w.work;
-        w.work = nullptr;
-      } else {
-        // Pick up pending work
-        fn = pending_.front();
-        pending_.pop_front();
-        if (pending_.empty()) {
-          empty_.notify_all();
-        }
-      }
-      if (fn) {
-        mu_.unlock();
-        fn();
-        mu_.lock();
-      }
-    }
-  }
-
- private:
-  struct Waiter {
-    std::condition_variable cv;
-    std::function<void()> work;
-    bool ready;
-  };
-
-  std::mutex mu_;
-  std::vector<std::thread*> threads_;               // All threads
-  std::vector<Waiter*> waiters_;                    // Stack of waiting threads.
-  std::deque<std::function<void()>> pending_;       // Queue of pending work
-  std::condition_variable empty_;                   // Signaled on pending_.empty()
-  bool exiting_ = false;
-};
-
-
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object.
-// but only one caller must call Notify() on the object.
-class Notification {
- public:
-  Notification() : notified_(false) {}
-  ~Notification() {}
-
-  void Notify() {
-    std::unique_lock<std::mutex> l(mu_);
-    eigen_assert(!notified_);
-    notified_ = true;
-    cv_.notify_all();
-  }
-
-  void WaitForNotification() {
-    std::unique_lock<std::mutex> l(mu_);
-    cv_.wait(l, [this]() { return notified_; } );
-  }
-
- private:
-  std::mutex mu_;
-  std::condition_variable cv_;
-  bool notified_;
-};
-
-// Runs an arbitrary function and then calls Notify() on the passed in
-// Notification.
-template <typename Function, typename... Args> struct FunctionWrapper
-{
-  static void run(Notification* n, Function f, Args... args) {
-    f(args...);
-    n->Notify();
-  }
-};
-
-static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
-  if (n) {
-    n->WaitForNotification();
-  }
-}
-
-
-// Build a thread pool device on top the an existing pool of threads.
-struct ThreadPoolDevice {
-  // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
-
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return internal::aligned_malloc(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    internal::aligned_free(buffer);
-  }
-
-  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
-  }
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-    memcpy(dst, src, n);
-  }
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-    memcpy(dst, src, n);
-  }
-
-  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-    ::memset(buffer, c, n);
-  }
-
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    return num_threads_;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-    // Should return an enum that encodes the ISA supported by the CPU
-    return 1;
-  }
-
-  template <class Function, class... Args>
-  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
-    Notification* n = new Notification();
-    std::function<void()> func =
-      std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
-    pool_->Schedule(func);
-    return n;
-  }
-  template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    std::function<void()> func = std::bind(f, args...);
-    pool_->Schedule(func);
-  }
-
- private:
-  ThreadPoolInterface* pool_;
-  size_t num_threads_;
-};
-
-#endif
-
-
-// GPU offloading
-#ifdef EIGEN_USE_GPU
-
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@ -515,8 +255,6 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
 }
 #endif

-#endif
-
 }  // end namespace Eigen

 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    internal::aligned_free(buffer);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#ifndef __CUDA_ARCH__
+    // Running on the host CPU
+    return 1;
+#else
+    // Running on a CUDA device
+    return 32;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+    // Running single threaded on the host CPU
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+#else
+    // Running on a CUDA device
+    return __CUDA_ARCH__ / 100;
+#endif
+  }
+};
+
+}  // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@ -0,0 +1,224 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+  virtual void Schedule(std::function<void()> fn) = 0;
+
+  virtual ~ThreadPoolInterface() {}
+};
+
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+class ThreadPool : public ThreadPoolInterface {
+ public:
+  // Construct a pool that contains "num_threads" threads.
+  explicit ThreadPool(int num_threads) {
+    for (int i = 0; i < num_threads; i++) {
+      threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
+    }
+  }
+
+  // Wait until all scheduled work has finished and then destroy the
+  // set of threads.
+  ~ThreadPool()
+  {
+    {
+      // Wait for all work to get done.
+      std::unique_lock<std::mutex> l(mu_);
+      empty_.wait(l, [this]() { return pending_.empty(); });
+      exiting_ = true;
+
+      // Wakeup all waiters.
+      for (auto w : waiters_) {
+        w->ready = true;
+        w->work = nullptr;
+        w->cv.notify_one();
+      }
+    }
+
+    // Wait for threads to finish.
+    for (auto t : threads_) {
+      t->join();
+      delete t;
+    }
+  }
+
+  // Schedule fn() for execution in the pool of threads. The functions are
+  // executed in the order in which they are scheduled.
+  void Schedule(std::function<void()> fn) {
+    std::unique_lock<std::mutex> l(mu_);
+    if (waiters_.empty()) {
+      pending_.push_back(fn);
+    } else {
+      Waiter* w = waiters_.back();
+      waiters_.pop_back();
+      w->ready = true;
+      w->work = fn;
+      w->cv.notify_one();
+    }
+  }
+
+ protected:
+  void WorkerLoop() {
+    std::unique_lock<std::mutex> l(mu_);
+    Waiter w;
+    while (!exiting_) {
+      std::function<void()> fn;
+      if (pending_.empty()) {
+        // Wait for work to be assigned to me
+        w.ready = false;
+        waiters_.push_back(&w);
+        w.cv.wait(l, [&w]() { return w.ready; });
+        fn = w.work;
+        w.work = nullptr;
+      } else {
+        // Pick up pending work
+        fn = pending_.front();
+        pending_.pop_front();
+        if (pending_.empty()) {
+          empty_.notify_all();
+        }
+      }
+      if (fn) {
+        mu_.unlock();
+        fn();
+        mu_.lock();
+      }
+    }
+  }
+
+ private:
+  struct Waiter {
+    std::condition_variable cv;
+    std::function<void()> work;
+    bool ready;
+  };
+
+  std::mutex mu_;
+  std::vector<std::thread*> threads_;               // All threads
+  std::vector<Waiter*> waiters_;                    // Stack of waiting threads.
+  std::deque<std::function<void()>> pending_;       // Queue of pending work
+  std::condition_variable empty_;                   // Signaled on pending_.empty()
+  bool exiting_ = false;
+};
+
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object.
+// but only one caller must call Notify() on the object.
+class Notification {
+ public:
+  Notification() : notified_(false) {}
+  ~Notification() {}
+
+  void Notify() {
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void WaitForNotification() {
+    std::unique_lock<std::mutex> l(mu_);
+    cv_.wait(l, [this]() { return notified_; } );
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  bool notified_;
+};
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args> struct FunctionWrapper
+{
+  static void run(Notification* n, Function f, Args... args) {
+    f(args...);
+    n->Notify();
+  }
+};
+
+static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+  if (n) {
+    n->WaitForNotification();
+  }
+}
+
+
+// Build a thread pool device on top the an existing pool of threads.
+struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
+  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return internal::aligned_malloc(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    internal::aligned_free(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    return num_threads_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+    Notification* n = new Notification();
+    std::function<void()> func =
+      std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
+    pool_->Schedule(func);
+    return n;
+  }
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    std::function<void()> func = std::bind(f, args...);
+    pool_->Schedule(func);
+  }
+
+ private:
+  ThreadPoolInterface* pool_;
+  size_t num_threads_;
+};
+
+
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H