Avoid atomic false sharing in RunQueue

Eugene Zhulenev 2024-08-01 17:41:16 +00:00 committed by Rasmus Munk Larsen
parent 0b646f3f36
commit fd98cc49f1
3 changed files with 47 additions and 31 deletions

@@ -38,6 +38,15 @@
#define EIGEN_ALIGNOF(x) alignof(x)
#endif
// Align to the boundary that avoids false sharing.
// https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
#ifdef __cpp_lib_hardware_interference_size
#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
#else
// Overalign for the cache line size of 128 bytes (Apple M1)
#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
#endif
// If the user explicitly disables vectorization, then we also disable alignment
#if defined(EIGEN_DONT_VECTORIZE)
#if defined(EIGEN_GPUCC)
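
As a rough illustration of how the new macro is meant to be used (a hypothetical sketch, not part of this commit; it assumes the macro is reachable via Eigen/Core after this change), two members written by different threads can be padded onto separate cache lines:

// Hypothetical example: each counter is written by a different thread, so the
// over-alignment keeps them on distinct cache lines and stores to one do not
// invalidate the line holding the other.
#include <atomic>
#include <Eigen/Core>

struct SharedCounters {
  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> produced{0};  // written by the producer thread
  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> consumed{0};  // written by the consumer thread
};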

@@ -57,6 +57,9 @@ class EventCount {
eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
}
EventCount(const EventCount&) = delete;
void operator=(const EventCount&) = delete;
~EventCount() {
// Ensure there are no waiters.
eigen_plain_assert(state_.load() == kStackMask);
@@ -155,22 +158,6 @@ class EventCount {
}
}
class Waiter {
friend class EventCount;
// Align to 128 byte boundary to prevent false sharing with other Waiter
// objects in the same vector.
EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
EIGEN_MUTEX mu;
EIGEN_CONDVAR cv;
uint64_t epoch = 0;
unsigned state = kNotSignaled;
enum {
kNotSignaled,
kWaiting,
kSignaled,
};
};
private:
// State_ layout:
// - low kWaiterBits is a stack of waiters committed to wait
@@ -192,9 +179,25 @@ class EventCount {
static const uint64_t kEpochBits = 64 - kEpochShift;
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
std::atomic<uint64_t> state_;
MaxSizeVector<Waiter>& waiters_;
public:
class Waiter {
friend class EventCount;
enum State {
kNotSignaled,
kWaiting,
kSignaled,
};
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
EIGEN_MUTEX mu;
EIGEN_CONDVAR cv;
uint64_t epoch{0};
unsigned state{kNotSignaled};
};
private:
static void CheckState(uint64_t state, bool waiter = false) {
static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
@@ -229,8 +232,8 @@ class EventCount {
}
}
EventCount(const EventCount&) = delete;
void operator=(const EventCount&) = delete;
std::atomic<uint64_t> state_;
MaxSizeVector<Waiter>& waiters_;
};
} // namespace Eigen
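
Because the atomic member now carries the over-alignment, the whole Waiter class inherits that alignment requirement and sizeof(Waiter) is rounded up to a multiple of it, so adjacent Waiter objects in the waiters_ vector cannot share a cache line. A small check (illustrative only; assumes the unsupported ThreadPool module header path):

#include <cstdio>
#include <unsupported/Eigen/CXX11/ThreadPool>

int main() {
  // sizeof is always a multiple of alignof, so consecutive elements of
  // MaxSizeVector<Waiter> start on separate interference boundaries.
  std::printf("alignof(Waiter) = %zu, sizeof(Waiter) = %zu\n",
              alignof(Eigen::EventCount::Waiter), sizeof(Eigen::EventCount::Waiter));
  return 0;
}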

@@ -154,16 +154,18 @@ class RunQueue {
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
struct Elem {
std::atomic<uint8_t> state;
Work w;
};
enum {
enum State {
kEmpty,
kBusy,
kReady,
};
EIGEN_MUTEX mutex_;
struct Elem {
std::atomic<uint8_t> state;
Work w;
};
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
// front/back, respectively. The remaining bits contain modification counters
// that are incremented on Push operations. This allows us to (1) distinguish
@@ -171,9 +173,11 @@ class RunQueue {
// position, these conditions would be indistinguishable); (2) obtain
// consistent snapshot of front_/back_ for Size operation using the
// modification counters.
std::atomic<unsigned> front_;
std::atomic<unsigned> back_;
Elem array_[kSize];
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
EIGEN_MUTEX mutex_; // guards `PushBack` and `PopBack` (accesses `back_`)
EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
// SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
// only whether the size is 0 is guaranteed to be correct.
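
The padding matters because the two indices are written from different threads: front operations (PushFront/PopFront) are owner-only and touch front_, while back operations (PushBack/PopBack) may come from stealing threads and touch back_ under mutex_. With both atomics on one cache line, every steal would invalidate the owner's hot line. A minimal usage sketch of that split (illustrative only; assumes the unsupported ThreadPool module header and Work = int):

#include <thread>
#include <unsupported/Eigen/CXX11/ThreadPool>

int main() {
  Eigen::RunQueue<int, 1024> q;  // Work = int purely for the sketch
  std::thread thief([&q] {
    for (int i = 0; i < 1000; ++i) (void)q.PopBack();  // stealer side: back_ and mutex_
  });
  for (int i = 0; i < 1000; ++i) {
    (void)q.PushFront(i);  // owner side: hot path touches only front_
    (void)q.PopFront();
  }
  thief.join();
  return 0;
}
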
@@ -208,12 +212,12 @@
EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
int size = (front & kMask2) - (back & kMask2);
// Fix overflow.
if (size < 0) size += 2 * kSize;
if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
// Order of modification in push/pop is crafted to make the queue look
// larger than it is during concurrent modifications. E.g. push can
// increment size before the corresponding pop has decremented it.
// So the computed size can be up to kSize + 1, fix it.
if (size > static_cast<int>(kSize)) size = kSize;
if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
return static_cast<unsigned>(size);
}
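
For intuition, the same arithmetic with concrete numbers (a standalone sketch; kSize = 8 is hypothetical, so the rolling index occupies log2(8) + 1 = 4 bits and kMask2 = 15):

// Mirrors CalculateSize above for kSize = 8. Example: front & kMask2 == 2 and
// back & kMask2 == 14 means the rolling index wrapped, giving 2 - 14 = -12,
// which the overflow fix turns into -12 + 16 = 4 elements.
unsigned CalculateSizeExample(unsigned front, unsigned back) {
  constexpr unsigned kSize = 8;
  constexpr unsigned kMask2 = (kSize << 1) - 1;
  int size = (front & kMask2) - (back & kMask2);
  if (size < 0) size += 2 * kSize;                   // fix rolling-index wraparound
  if (size > static_cast<int>(kSize)) size = kSize;  // a concurrent Push can overshoot by one
  return static_cast<unsigned>(size);
}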