Avoid a division in NonBlockingThreadPool::Steal.

Looking at profiles, we spend ~10-20% of Steal simply computing "random % size". Instead, we can reduce a random 32-bit integer into the [0, size) range with a single multiplication and shift. This transformation is described in https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
2025-10-09 22:56:31 +08:00 · 2020-02-14 16:02:57 -05:00 · 2020-02-14 16:02:57 -05:00 · eb6cc29583
commit eb6cc29583
parent 7769600245
1 changed files with 6 additions and 2 deletions
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -335,8 +335,12 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
    PerThread* pt = GetPerThread();
    const size_t size = limit - start;
    unsigned r = Rand(&pt->rand);
-    unsigned victim = r % size;
+    // Reduce r into [0, size) range, this utilizes trick from
-    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+    unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+    unsigned inc = all_coprimes_[size - 1][index];
    for (unsigned i = 0; i < size; i++) {
      eigen_plain_assert(start + victim < limit);