Avoid atomic false sharing in RunQueue

Eugene Zhulenev 2024-08-01 17:41:16 +00:00 committed by Rasmus Munk Larsen
parent 0b646f3f36
commit fd98cc49f1
3 changed files with 47 additions and 31 deletions

@@ -38,6 +38,15 @@
#define EIGEN_ALIGNOF(x) alignof(x)
#endif
// Align to the boundary that avoids false sharing.
// https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
#ifdef __cpp_lib_hardware_interference_size
#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
#else
// Overalign for the cache line size of 128 bytes (Apple M1)
#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
#endif
// If the user explicitly disables vectorization, then we also disable alignment
#if defined(EIGEN_DONT_VECTORIZE)
#if defined(EIGEN_GPUCC)
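
As a rough illustration of how the new macro is meant to be used (a hypothetical sketch, not part of this commit; it assumes the macro is reachable via Eigen/Core after this change), two members written by different threads can be padded onto separate cache lines:

// Hypothetical example: each counter is written by a different thread, so the
// over-alignment keeps them on distinct cache lines and stores to one do not
// invalidate the line holding the other.
#include <atomic>
#include <Eigen/Core>

struct SharedCounters {
  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> produced{0};  // written by the producer thread
  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> consumed{0};  // written by the consumer thread
};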

@@ -57,6 +57,9 @@ class EventCount {
eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
}
EventCount(const EventCount&) = delete;
void operator=(const EventCount&) = delete;
~EventCount() {
// Ensure there are no waiters.
eigen_plain_assert(state_.load() == kStackMask);
@@ -155,22 +158,6 @@ class EventCount {
}
}
class Waiter {
friend class EventCount;
// Align to 128 byte boundary to prevent false sharing with other Waiter
// objects in the same vector.
EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
EIGEN_MUTEX mu;
EIGEN_CONDVAR cv;
uint64_t epoch = 0;
unsigned state = kNotSignaled;
enum {
kNotSignaled,
kWaiting,
kSignaled,
};
};
private:
// State_ layout:
// - low kWaiterBits is a stack of waiters committed to wait
@@ -192,9 +179,25 @@ class EventCount {
static const uint64_t kEpochBits = 64 - kEpochShift;
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
std::atomic<uint64_t> state_;
MaxSizeVector<Waiter>& waiters_;
public:
class Waiter {
friend class EventCount;
enum State {
kNotSignaled,
kWaiting,
kSignaled,
};
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
EIGEN_MUTEX mu;
EIGEN_CONDVAR cv;
uint64_t epoch{0};
unsigned state{kNotSignaled};
};
private:
static void CheckState(uint64_t state, bool waiter = false) {
static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
@@ -229,8 +232,8 @@ class EventCount {
}
}
EventCount(const EventCount&) = delete;
void operator=(const EventCount&) = delete;
std::atomic<uint64_t> state_;
MaxSizeVector<Waiter>& waiters_;
};
} // namespace Eigen
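
Because the atomic member now carries the over-alignment, the whole Waiter class inherits that alignment requirement and sizeof(Waiter) is rounded up to a multiple of it, so adjacent Waiter objects in the waiters_ vector cannot share a cache line. A small check (illustrative only; assumes the unsupported ThreadPool module header path):

#include <cstdio>
#include <unsupported/Eigen/CXX11/ThreadPool>

int main() {
  // sizeof is always a multiple of alignof, so consecutive elements of
  // MaxSizeVector<Waiter> start on separate interference boundaries.
  std::printf("alignof(Waiter) = %zu, sizeof(Waiter) = %zu\n",
              alignof(Eigen::EventCount::Waiter), sizeof(Eigen::EventCount::Waiter));
  return 0;
}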

@@ -154,16 +154,18 @@ class RunQueue {
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
struct Elem {
std::atomic<uint8_t> state;
Work w;
};
enum {
enum State {
kEmpty,
kBusy,
kReady,
};
EIGEN_MUTEX mutex_;
struct Elem {
std::atomic<uint8_t> state;
Work w;
};
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
// front/back, respectively. The remaining bits contain modification counters
// that are incremented on Push operations. This allows us to (1) distinguish
@@ -171,9 +173,11 @@ class RunQueue {
// position, these conditions would be indistinguishable); (2) obtain
// consistent snapshot of front_/back_ for Size operation using the
// modification counters.
std::atomic<unsigned> front_;
std::atomic<unsigned> back_;
Elem array_[kSize];
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
EIGEN_MUTEX mutex_; // guards `PushBack` and `PopBack` (accesses `back_`)
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
// SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
// only whether the size is 0 is guaranteed to be correct.
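
The padding matters because the two indices are written from different threads: front operations (PushFront/PopFront) are owner-only and touch front_, while back operations (PushBack/PopBack) may come from stealing threads and touch back_ under mutex_. With both atomics on one cache line, every steal would invalidate the owner's hot line. A minimal usage sketch of that split (illustrative only; assumes the unsupported ThreadPool module header and Work = int):

#include <thread>
#include <unsupported/Eigen/CXX11/ThreadPool>

int main() {
  Eigen::RunQueue<int, 1024> q;  // Work = int purely for the sketch
  std::thread thief([&q] {
    for (int i = 0; i < 1000; ++i) (void)q.PopBack();  // stealer side: back_ and mutex_
  });
  for (int i = 0; i < 1000; ++i) {
    (void)q.PushFront(i);  // owner side: hot path touches only front_
    (void)q.PopFront();
  }
  thief.join();
  return 0;
}
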
@@ -208,12 +212,12 @@
EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
int size = (front & kMask2) - (back & kMask2);
// Fix overflow.
if (size < 0) size += 2 * kSize;
if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
// Order of modification in push/pop is crafted to make the queue look
// larger than it is during concurrent modifications. E.g. push can
// increment size before the corresponding pop has decremented it.
// So the computed size can be up to kSize + 1, fix it.
if (size > static_cast<int>(kSize)) size = kSize;
if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
return static_cast<unsigned>(size);
}
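
For intuition, the same arithmetic with concrete numbers (a standalone sketch; kSize = 8 is hypothetical, so the rolling index occupies log2(8) + 1 = 4 bits and kMask2 = 15):

// Mirrors CalculateSize above for kSize = 8. Example: front & kMask2 == 2 and
// back & kMask2 == 14 means the rolling index wrapped, giving 2 - 14 = -12,
// which the overflow fix turns into -12 + 16 = 4 elements.
unsigned CalculateSizeExample(unsigned front, unsigned back) {
  constexpr unsigned kSize = 8;
  constexpr unsigned kMask2 = (kSize << 1) - 1;
  int size = (front & kMask2) - (back & kMask2);
  if (size < 0) size += 2 * kSize;                   // fix rolling-index wraparound
  if (size > static_cast<int>(kSize)) size = kSize;  // a concurrent Push can overshoot by one
  return static_cast<unsigned>(size);
}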