Slightly simplify ForkJoin code, and make sure the test is actually run.

2025-09-10 16:33:15 +08:00 · 2025-02-25 17:22:43 +00:00 · 2025-02-25 17:22:43 +00:00 · 72adf891d5
commit 72adf891d5
parent 6aebfa9acc
4 changed files with 69 additions and 88 deletions
--- a/Eigen/src/ThreadPool/ForkJoin.h
+++ b/Eigen/src/ThreadPool/ForkJoin.h
@ -31,7 +31,7 @@ namespace Eigen {
 // where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
 // task function `g(k)`, the same operation is applied with
 //
-//   f(i,j) = [&](){ for(int k = i; k < j; ++k) g(k); };
+//   f(i,j) = [&](){ for(Index k = i; k < j; ++k) g(k); };
 //
 // Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
 // given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
@ -45,51 +45,50 @@ namespace Eigen {
 // ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
 // ```
 //
-// Example usage #2 (asynchronous):
+// Example usage #2 (executing multiple tasks asynchronously, each one parallelized with ParallelFor):
 // ```
 // ThreadPool thread_pool(num_threads);
-// Barrier barrier(num_tasks * num_async_calls);
+// Barrier barrier(num_async_calls);
-// auto done = [&](){barrier.Notify();};
+// auto done = [&](){ barrier.Notify(); };
-// for (int k=0; k<num_async_calls; ++k) {
+// for (Index k=0; k<num_async_calls; ++k) {
-//   thread_pool.Schedule([&](){
+//   ForkJoinScheduler::ParallelForAsync(task_start[k], task_end[k], granularity[k], parallel_task[k], done,
-//     ForkJoinScheduler::ParallelForAsync(0, num_tasks, granularity, parallel_task, done, &thread_pool);
+//   &thread_pool);
 //   });
 // }
 // barrier.Wait();
 // ```
 class ForkJoinScheduler {
 public:
-  // Runs `do_func` asynchronously for the range [start, end) with a specified granularity. `do_func` should
+  // Runs `do_func` asynchronously for the range [start, end) with a specified
-  // either be of type `std::function<void(int)>` or `std::function<void(int, int)`.
+  // granularity. `do_func` should be of type `std::function<void(Index,
-  // If `end > start`, the `done` callback will be called `end - start` times when all tasks have been
+  // Index)>`. `done()` is called exactly once after all tasks have been executed.
-  // executed. Otherwise, `done` is called only once.
+  template <typename DoFnType, typename DoneFnType>
-  template <typename DoFnType>
+  static void ParallelForAsync(Index start, Index end, Index granularity, DoFnType&& do_func, DoneFnType&& done,
-  static void ParallelForAsync(int start, int end, int granularity, DoFnType do_func, std::function<void()> done,
+                               ThreadPool* thread_pool) {
                               Eigen::ThreadPool* thread_pool) {
    if (start >= end) {
      done();
      return;
    }
-    ForkJoinScheduler::RunParallelForAsync(start, end, granularity, do_func, done, thread_pool);
+    thread_pool->Schedule([start, end, granularity, thread_pool, do_func = std::forward<DoFnType>(do_func),
                           done = std::forward<DoneFnType>(done)]() {
      RunParallelFor(start, end, granularity, do_func, thread_pool);
      done();
    });
  }
  // Synchronous variant of ParallelForAsync.
  template <typename DoFnType>
-  static void ParallelFor(int start, int end, int granularity, DoFnType do_func, Eigen::ThreadPool* thread_pool) {
+  static void ParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
    if (start >= end) return;
    auto dummy_done = []() {};
    Barrier barrier(1);
-    thread_pool->Schedule([start, end, granularity, thread_pool, &do_func, &dummy_done, &barrier]() {
+    auto done = [&barrier]() { barrier.Notify(); };
-      ForkJoinScheduler::ParallelForAsync(start, end, granularity, do_func, dummy_done, thread_pool);
+    ParallelForAsync(start, end, granularity, do_func, done, thread_pool);
      barrier.Notify();
    });
    barrier.Wait();
  }
 private:
  // Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
  template <typename LeftType, typename RightType>
-  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, Eigen::ThreadPool* thread_pool) {
+  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, ThreadPool* thread_pool) {
    std::atomic<bool> right_done(false);
    auto execute_right = [&right_thunk, &right_done]() {
      std::forward<RightType>(right_thunk)();
@ -97,47 +96,38 @@ class ForkJoinScheduler {
    };
    thread_pool->Schedule(execute_right);
    std::forward<LeftType>(left_thunk)();
-    Eigen::ThreadPool::Task task;
+    ThreadPool::Task task;
    while (!right_done.load(std::memory_order_acquire)) {
      thread_pool->MaybeGetTask(&task);
      if (task.f) task.f();
    }
  }
-  // Runs `do_func` in parallel for the range [start, end). The main recursive asynchronous runner that
+  static Index ComputeMidpoint(Index start, Index end, Index granularity) {
-  // calls `ForkJoin`.
+    // Typical workloads choose initial values of `{start, end, granularity}` such that `end - start` and
-  static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int)>& do_func,
+    // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
-                                  std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
+    // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
-    std::function<void(int, int)> wrapped_do_func = [&do_func](int start, int end) {
+    // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
-      for (int i = start; i < end; ++i) do_func(i);
+    // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
-    };
+    const Index size = end - start;
-    ForkJoinScheduler::RunParallelForAsync(start, end, granularity, wrapped_do_func, done, thread_pool);
+    const Index offset = numext::round_down(9 * (size + 1) / 16, granularity);
    return start + offset;
  }
-  // Variant of `RunAsyncParallelFor` that uses a do function that operates on an index range.
+  template <typename DoFnType>
-  // Specifically, `do_func` takes two arguments: the start and end of the range.
+  static void RunParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
-  static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int, int)>& do_func,
+    Index mid = ComputeMidpoint(start, end, granularity);
-                                  std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
+    if ((end - start) < granularity || mid == start || mid == end) {
    if ((end - start) <= granularity) {
      do_func(start, end);
-      for (int j = 0; j < end - start; ++j) done();
+      return;
    } else {
       // Typical workloads choose initial values of `{start, end, granularity}` such that `end - start` and
      // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
      // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
      // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
      // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
      const int size = end - start;
      const int mid = start + 9 * (size + 1) / 16;
      ForkJoinScheduler::ForkJoin(
          [start, mid, granularity, &do_func, &done, thread_pool]() {
            RunParallelForAsync(start, mid, granularity, do_func, done, thread_pool);
          },
          [mid, end, granularity, &do_func, &done, thread_pool]() {
            RunParallelForAsync(mid, end, granularity, do_func, done, thread_pool);
          },
          thread_pool);
    }
    ForkJoin([start, mid, granularity, &do_func, thread_pool]() {
               RunParallelFor(start, mid, granularity, do_func, thread_pool);
             },
             [mid, end, granularity, &do_func, thread_pool]() {
               RunParallelFor(mid, end, granularity, do_func, thread_pool);
             },
             thread_pool);
  }
 };
--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@ -156,7 +156,10 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
  // Tries to assign work to the current task.
  void MaybeGetTask(Task* t) {
    PerThread* pt = GetPerThread();
-    Queue& q = thread_data_[pt->thread_id].queue;
+    const int thread_id = pt->thread_id;
    // If we are not a worker thread of this pool, we can't get any work.
    if (thread_id < 0) return;
    Queue& q = thread_data_[thread_id].queue;
    *t = q.PopFront();
    if (t->f) return;
    if (num_threads_ == 1) {
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -320,6 +320,7 @@ ei_add_test(tuple_test)
 ei_add_test(threads_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 ei_add_test(threads_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 ei_add_test(threads_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 ei_add_test(threads_fork_join "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
 check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
--- a/test/threads_fork_join.cpp
+++ b/test/threads_fork_join.cpp
@ -12,39 +12,26 @@
 #include "Eigen/ThreadPool"
 struct TestData {
-  ThreadPool tp;
+  std::unique_ptr<ThreadPool> tp;
  std::vector<double> data;
 };
 TestData make_test_data(int num_threads, int num_shards) {
-  return {ThreadPool(num_threads), std::vector<double>(num_shards, 1.0)};
+  return {std::make_unique<ThreadPool>(num_threads), std::vector<double>(num_shards, 1.0)};
 }
-static void test_unary_parallel_for(int granularity) {
+static void test_parallel_for(int granularity) {
  // Test correctness.
  const int kNumTasks = 100000;
  TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
-  std::atomic<double> sum = 0.0;
+  std::atomic<uint64_t> sum(0);
-  std::function<void(int)> unary_do_fn = [&](int i) {
+  std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
    for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
    };
  };
  ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(unary_do_fn), &test_data.tp);
  VERIFY_IS_EQUAL(sum, kNumTasks);
 }
 static void test_binary_parallel_for(int granularity) {
  // Test correctness.
  const int kNumTasks = 100000;
  TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
  std::atomic<double> sum = 0.0;
  std::function<void(int, int)> binary_do_fn = [&](int i, int j) {
    for (int k = i; k < j; ++k)
-      for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
+      for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
      };
  };
-  ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(binary_do_fn), &test_data.tp);
+  ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(binary_do_fn), test_data.tp.get());
-  VERIFY_IS_EQUAL(sum, kNumTasks);
+  VERIFY_IS_EQUAL(sum.load(), kNumTasks);
 }
 static void test_async_parallel_for() {
@ -54,26 +41,26 @@ static void test_async_parallel_for() {
  const int kNumTasks = 100;
  const int kNumAsyncCalls = kNumThreads * 4;
  TestData test_data = make_test_data(kNumThreads, kNumTasks);
-  std::atomic<double> sum = 0.0;
+  std::atomic<uint64_t> sum(0);
-  std::function<void(int)> unary_do_fn = [&](int i) {
+  std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
-    for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
+    for (Index k = i; k < j; ++k) {
-    };
+      for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
      }
    }
  };
-  Barrier barrier(kNumTasks * kNumAsyncCalls);
+  Barrier barrier(kNumAsyncCalls);
  std::function<void()> done = [&]() { barrier.Notify(); };
  for (int k = 0; k < kNumAsyncCalls; ++k) {
-    test_data.tp.Schedule([&]() {
+    test_data.tp->Schedule([&]() {
-      ForkJoinScheduler::ParallelForAsync(0, kNumTasks, /*granularity=*/1, unary_do_fn, done, &test_data.tp);
+      ForkJoinScheduler::ParallelForAsync(0, kNumTasks, /*granularity=*/1, binary_do_fn, done, test_data.tp.get());
    });
  }
  barrier.Wait();
-  VERIFY_IS_EQUAL(sum, kNumTasks * kNumAsyncCalls);
+  VERIFY_IS_EQUAL(sum.load(), kNumTasks * kNumAsyncCalls);
 }
 EIGEN_DECLARE_TEST(fork_join) {
-  CALL_SUBTEST(test_unary_parallel_for(1));
+  CALL_SUBTEST(test_parallel_for(1));
-  CALL_SUBTEST(test_unary_parallel_for(2));
+  CALL_SUBTEST(test_parallel_for(2));
  CALL_SUBTEST(test_binary_parallel_for(1));
  CALL_SUBTEST(test_binary_parallel_for(2));
  CALL_SUBTEST(test_async_parallel_for());
 }