mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 02:33:59 +08:00
Slightly simplify ForkJoin code, and make sure the test is actually run.
This commit is contained in:
parent
6aebfa9acc
commit
72adf891d5
@ -31,7 +31,7 @@ namespace Eigen {
|
|||||||
// where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
|
// where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
|
||||||
// task function `g(k)`, the same operation is applied with
|
// task function `g(k)`, the same operation is applied with
|
||||||
//
|
//
|
||||||
// f(i,j) = [&](){ for(int k = i; k < j; ++k) g(k); };
|
// f(i,j) = [&](){ for(Index k = i; k < j; ++k) g(k); };
|
||||||
//
|
//
|
||||||
// Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
|
// Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
|
||||||
// given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
|
// given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
|
||||||
@ -45,51 +45,50 @@ namespace Eigen {
|
|||||||
// ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
|
// ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
|
||||||
// ```
|
// ```
|
||||||
//
|
//
|
||||||
// Example usage #2 (asynchronous):
|
// Example usage #2 (executing multiple tasks asynchronously, each one parallelized with ParallelFor):
|
||||||
// ```
|
// ```
|
||||||
// ThreadPool thread_pool(num_threads);
|
// ThreadPool thread_pool(num_threads);
|
||||||
// Barrier barrier(num_tasks * num_async_calls);
|
// Barrier barrier(num_async_calls);
|
||||||
// auto done = [&](){barrier.Notify();};
|
// auto done = [&](){ barrier.Notify(); };
|
||||||
// for (int k=0; k<num_async_calls; ++k) {
|
// for (Index k=0; k<num_async_calls; ++k) {
|
||||||
// thread_pool.Schedule([&](){
|
// ForkJoinScheduler::ParallelForAsync(task_start[k], task_end[k], granularity[k], parallel_task[k], done,
|
||||||
// ForkJoinScheduler::ParallelForAsync(0, num_tasks, granularity, parallel_task, done, &thread_pool);
|
// &thread_pool);
|
||||||
// });
|
|
||||||
// }
|
// }
|
||||||
// barrier.Wait();
|
// barrier.Wait();
|
||||||
// ```
|
// ```
|
||||||
class ForkJoinScheduler {
|
class ForkJoinScheduler {
|
||||||
public:
|
public:
|
||||||
// Runs `do_func` asynchronously for the range [start, end) with a specified granularity. `do_func` should
|
// Runs `do_func` asynchronously for the range [start, end) with a specified
|
||||||
// either be of type `std::function<void(int)>` or `std::function<void(int, int)`.
|
// granularity. `do_func` should be of type `std::function<void(Index,
|
||||||
// If `end > start`, the `done` callback will be called `end - start` times when all tasks have been
|
// Index)>`. `done()` is called exactly once after all tasks have been executed.
|
||||||
// executed. Otherwise, `done` is called only once.
|
template <typename DoFnType, typename DoneFnType>
|
||||||
template <typename DoFnType>
|
static void ParallelForAsync(Index start, Index end, Index granularity, DoFnType&& do_func, DoneFnType&& done,
|
||||||
static void ParallelForAsync(int start, int end, int granularity, DoFnType do_func, std::function<void()> done,
|
ThreadPool* thread_pool) {
|
||||||
Eigen::ThreadPool* thread_pool) {
|
|
||||||
if (start >= end) {
|
if (start >= end) {
|
||||||
done();
|
done();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ForkJoinScheduler::RunParallelForAsync(start, end, granularity, do_func, done, thread_pool);
|
thread_pool->Schedule([start, end, granularity, thread_pool, do_func = std::forward<DoFnType>(do_func),
|
||||||
|
done = std::forward<DoneFnType>(done)]() {
|
||||||
|
RunParallelFor(start, end, granularity, do_func, thread_pool);
|
||||||
|
done();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Synchronous variant of ParallelForAsync.
|
// Synchronous variant of ParallelForAsync.
|
||||||
template <typename DoFnType>
|
template <typename DoFnType>
|
||||||
static void ParallelFor(int start, int end, int granularity, DoFnType do_func, Eigen::ThreadPool* thread_pool) {
|
static void ParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
|
||||||
if (start >= end) return;
|
if (start >= end) return;
|
||||||
auto dummy_done = []() {};
|
|
||||||
Barrier barrier(1);
|
Barrier barrier(1);
|
||||||
thread_pool->Schedule([start, end, granularity, thread_pool, &do_func, &dummy_done, &barrier]() {
|
auto done = [&barrier]() { barrier.Notify(); };
|
||||||
ForkJoinScheduler::ParallelForAsync(start, end, granularity, do_func, dummy_done, thread_pool);
|
ParallelForAsync(start, end, granularity, do_func, done, thread_pool);
|
||||||
barrier.Notify();
|
|
||||||
});
|
|
||||||
barrier.Wait();
|
barrier.Wait();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
|
// Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
|
||||||
template <typename LeftType, typename RightType>
|
template <typename LeftType, typename RightType>
|
||||||
static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, Eigen::ThreadPool* thread_pool) {
|
static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, ThreadPool* thread_pool) {
|
||||||
std::atomic<bool> right_done(false);
|
std::atomic<bool> right_done(false);
|
||||||
auto execute_right = [&right_thunk, &right_done]() {
|
auto execute_right = [&right_thunk, &right_done]() {
|
||||||
std::forward<RightType>(right_thunk)();
|
std::forward<RightType>(right_thunk)();
|
||||||
@ -97,47 +96,38 @@ class ForkJoinScheduler {
|
|||||||
};
|
};
|
||||||
thread_pool->Schedule(execute_right);
|
thread_pool->Schedule(execute_right);
|
||||||
std::forward<LeftType>(left_thunk)();
|
std::forward<LeftType>(left_thunk)();
|
||||||
Eigen::ThreadPool::Task task;
|
ThreadPool::Task task;
|
||||||
while (!right_done.load(std::memory_order_acquire)) {
|
while (!right_done.load(std::memory_order_acquire)) {
|
||||||
thread_pool->MaybeGetTask(&task);
|
thread_pool->MaybeGetTask(&task);
|
||||||
if (task.f) task.f();
|
if (task.f) task.f();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Runs `do_func` in parallel for the range [start, end). The main recursive asynchronous runner that
|
static Index ComputeMidpoint(Index start, Index end, Index granularity) {
|
||||||
// calls `ForkJoin`.
|
// Typical workloads choose initial values of `{start, end, granularity}` such that `end - start` and
|
||||||
static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int)>& do_func,
|
// `granularity` are powers of two. Since modern processors usually implement (2^x)-way
|
||||||
std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
|
// set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
|
||||||
std::function<void(int, int)> wrapped_do_func = [&do_func](int start, int end) {
|
// powers of two (to avoid having two addresses in the main memory pointing to the same point in the
|
||||||
for (int i = start; i < end; ++i) do_func(i);
|
// cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
|
||||||
};
|
const Index size = end - start;
|
||||||
ForkJoinScheduler::RunParallelForAsync(start, end, granularity, wrapped_do_func, done, thread_pool);
|
const Index offset = numext::round_down(9 * (size + 1) / 16, granularity);
|
||||||
|
return start + offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Variant of `RunAsyncParallelFor` that uses a do function that operates on an index range.
|
template <typename DoFnType>
|
||||||
// Specifically, `do_func` takes two arguments: the start and end of the range.
|
static void RunParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
|
||||||
static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int, int)>& do_func,
|
Index mid = ComputeMidpoint(start, end, granularity);
|
||||||
std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
|
if ((end - start) < granularity || mid == start || mid == end) {
|
||||||
if ((end - start) <= granularity) {
|
|
||||||
do_func(start, end);
|
do_func(start, end);
|
||||||
for (int j = 0; j < end - start; ++j) done();
|
return;
|
||||||
} else {
|
|
||||||
// Typical workloads choose initial values of `{start, end, granularity}` such that `start - end` and
|
|
||||||
// `granularity` are powers of two. Since modern processors usually implement (2^x)-way
|
|
||||||
// set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
|
|
||||||
// powers of two (to avoid having two addresses in the main memory pointing to the same point in the
|
|
||||||
// cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
|
|
||||||
const int size = end - start;
|
|
||||||
const int mid = start + 9 * (size + 1) / 16;
|
|
||||||
ForkJoinScheduler::ForkJoin(
|
|
||||||
[start, mid, granularity, &do_func, &done, thread_pool]() {
|
|
||||||
RunParallelForAsync(start, mid, granularity, do_func, done, thread_pool);
|
|
||||||
},
|
|
||||||
[mid, end, granularity, &do_func, &done, thread_pool]() {
|
|
||||||
RunParallelForAsync(mid, end, granularity, do_func, done, thread_pool);
|
|
||||||
},
|
|
||||||
thread_pool);
|
|
||||||
}
|
}
|
||||||
|
ForkJoin([start, mid, granularity, &do_func, thread_pool]() {
|
||||||
|
RunParallelFor(start, mid, granularity, do_func, thread_pool);
|
||||||
|
},
|
||||||
|
[mid, end, granularity, &do_func, thread_pool]() {
|
||||||
|
RunParallelFor(mid, end, granularity, do_func, thread_pool);
|
||||||
|
},
|
||||||
|
thread_pool);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -156,7 +156,10 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
|||||||
// Tries to assign work to the current task.
|
// Tries to assign work to the current task.
|
||||||
void MaybeGetTask(Task* t) {
|
void MaybeGetTask(Task* t) {
|
||||||
PerThread* pt = GetPerThread();
|
PerThread* pt = GetPerThread();
|
||||||
Queue& q = thread_data_[pt->thread_id].queue;
|
const int thread_id = pt->thread_id;
|
||||||
|
// If we are not a worker thread of this pool, we can't get any work.
|
||||||
|
if (thread_id < 0) return;
|
||||||
|
Queue& q = thread_data_[thread_id].queue;
|
||||||
*t = q.PopFront();
|
*t = q.PopFront();
|
||||||
if (t->f) return;
|
if (t->f) return;
|
||||||
if (num_threads_ == 1) {
|
if (num_threads_ == 1) {
|
||||||
|
@ -320,6 +320,7 @@ ei_add_test(tuple_test)
|
|||||||
ei_add_test(threads_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
ei_add_test(threads_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||||
ei_add_test(threads_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
ei_add_test(threads_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||||
ei_add_test(threads_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
ei_add_test(threads_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||||
|
ei_add_test(threads_fork_join "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||||
add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
|
add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
|
||||||
|
|
||||||
check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
|
check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
|
||||||
|
@ -12,39 +12,26 @@
|
|||||||
#include "Eigen/ThreadPool"
|
#include "Eigen/ThreadPool"
|
||||||
|
|
||||||
struct TestData {
|
struct TestData {
|
||||||
ThreadPool tp;
|
std::unique_ptr<ThreadPool> tp;
|
||||||
std::vector<double> data;
|
std::vector<double> data;
|
||||||
};
|
};
|
||||||
|
|
||||||
TestData make_test_data(int num_threads, int num_shards) {
|
TestData make_test_data(int num_threads, int num_shards) {
|
||||||
return {ThreadPool(num_threads), std::vector<double>(num_shards, 1.0)};
|
return {std::make_unique<ThreadPool>(num_threads), std::vector<double>(num_shards, 1.0)};
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_unary_parallel_for(int granularity) {
|
static void test_parallel_for(int granularity) {
|
||||||
// Test correctness.
|
// Test correctness.
|
||||||
const int kNumTasks = 100000;
|
const int kNumTasks = 100000;
|
||||||
TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
|
TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
|
||||||
std::atomic<double> sum = 0.0;
|
std::atomic<uint64_t> sum(0);
|
||||||
std::function<void(int)> unary_do_fn = [&](int i) {
|
std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
|
||||||
for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
|
|
||||||
};
|
|
||||||
};
|
|
||||||
ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(unary_do_fn), &test_data.tp);
|
|
||||||
VERIFY_IS_EQUAL(sum, kNumTasks);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void test_binary_parallel_for(int granularity) {
|
|
||||||
// Test correctness.
|
|
||||||
const int kNumTasks = 100000;
|
|
||||||
TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
|
|
||||||
std::atomic<double> sum = 0.0;
|
|
||||||
std::function<void(int, int)> binary_do_fn = [&](int i, int j) {
|
|
||||||
for (int k = i; k < j; ++k)
|
for (int k = i; k < j; ++k)
|
||||||
for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
|
for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(binary_do_fn), &test_data.tp);
|
ForkJoinScheduler::ParallelFor(0, kNumTasks, granularity, std::move(binary_do_fn), test_data.tp.get());
|
||||||
VERIFY_IS_EQUAL(sum, kNumTasks);
|
VERIFY_IS_EQUAL(sum.load(), kNumTasks);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_async_parallel_for() {
|
static void test_async_parallel_for() {
|
||||||
@ -54,26 +41,26 @@ static void test_async_parallel_for() {
|
|||||||
const int kNumTasks = 100;
|
const int kNumTasks = 100;
|
||||||
const int kNumAsyncCalls = kNumThreads * 4;
|
const int kNumAsyncCalls = kNumThreads * 4;
|
||||||
TestData test_data = make_test_data(kNumThreads, kNumTasks);
|
TestData test_data = make_test_data(kNumThreads, kNumTasks);
|
||||||
std::atomic<double> sum = 0.0;
|
std::atomic<uint64_t> sum(0);
|
||||||
std::function<void(int)> unary_do_fn = [&](int i) {
|
std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
|
||||||
for (double new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
|
for (Index k = i; k < j; ++k) {
|
||||||
};
|
for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[i]);) {
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
Barrier barrier(kNumTasks * kNumAsyncCalls);
|
Barrier barrier(kNumAsyncCalls);
|
||||||
std::function<void()> done = [&]() { barrier.Notify(); };
|
std::function<void()> done = [&]() { barrier.Notify(); };
|
||||||
for (int k = 0; k < kNumAsyncCalls; ++k) {
|
for (int k = 0; k < kNumAsyncCalls; ++k) {
|
||||||
test_data.tp.Schedule([&]() {
|
test_data.tp->Schedule([&]() {
|
||||||
ForkJoinScheduler::ParallelForAsync(0, kNumTasks, /*granularity=*/1, unary_do_fn, done, &test_data.tp);
|
ForkJoinScheduler::ParallelForAsync(0, kNumTasks, /*granularity=*/1, binary_do_fn, done, test_data.tp.get());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
barrier.Wait();
|
barrier.Wait();
|
||||||
VERIFY_IS_EQUAL(sum, kNumTasks * kNumAsyncCalls);
|
VERIFY_IS_EQUAL(sum.load(), kNumTasks * kNumAsyncCalls);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DECLARE_TEST(fork_join) {
|
EIGEN_DECLARE_TEST(fork_join) {
|
||||||
CALL_SUBTEST(test_unary_parallel_for(1));
|
CALL_SUBTEST(test_parallel_for(1));
|
||||||
CALL_SUBTEST(test_unary_parallel_for(2));
|
CALL_SUBTEST(test_parallel_for(2));
|
||||||
CALL_SUBTEST(test_binary_parallel_for(1));
|
|
||||||
CALL_SUBTEST(test_binary_parallel_for(2));
|
|
||||||
CALL_SUBTEST(test_async_parallel_for());
|
CALL_SUBTEST(test_async_parallel_for());
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user