mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-10-09 22:56:31 +08:00
Avoid a division in NonBlockingThreadPool::Steal.
Looking at profiles, we spend ~10-20% of Steal simply computing random % size. We can reduce a random 32-bit int into the [0, size) range with a single multiplication and shift. This transformation is described in https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
This commit is contained in:
parent
7769600245
commit
eb6cc29583
@ -335,8 +335,12 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
|||||||
PerThread* pt = GetPerThread();
|
PerThread* pt = GetPerThread();
|
||||||
const size_t size = limit - start;
|
const size_t size = limit - start;
|
||||||
unsigned r = Rand(&pt->rand);
|
unsigned r = Rand(&pt->rand);
|
||||||
unsigned victim = r % size;
|
// Reduce r into [0, size) range, this utilizes trick from
|
||||||
unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
|
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
||||||
|
eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
|
||||||
|
unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
|
||||||
|
unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
|
||||||
|
unsigned inc = all_coprimes_[size - 1][index];
|
||||||
|
|
||||||
for (unsigned i = 0; i < size; i++) {
|
for (unsigned i = 0; i < size; i++) {
|
||||||
eigen_plain_assert(start + victim < limit);
|
eigen_plain_assert(start + victim < limit);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user