mirror of https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00

Incorporate Threadpool in Eigen Core

This commit is contained in:
parent d165c7377f
commit 99adca8b34
@@ -404,6 +404,7 @@ using std::ptrdiff_t;
 #endif
 
 #include "src/Core/GlobalFunctions.h"
+#include "src/Core/DeviceWrapper.h"
 
 // IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
@@ -71,6 +71,7 @@
 #include "src/ThreadPool/ThreadEnvironment.h"
 #include "src/ThreadPool/Barrier.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
+#include "src/ThreadPool/CoreThreadPoolDevice.h"
 
 // IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
Eigen/src/Core/DeviceWrapper.h (new file, 155 lines)
@@ -0,0 +1,155 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_DEVICEWRAPPER_H
#define EIGEN_DEVICEWRAPPER_H

namespace Eigen {
template <typename Derived, typename Device>
struct DeviceWrapper {
  using Base = EigenBase<internal::remove_all_t<Derived>>;
  using Scalar = typename Derived::Scalar;

  EIGEN_DEVICE_FUNC DeviceWrapper(Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
  EIGEN_DEVICE_FUNC DeviceWrapper(const Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}

  template <typename OtherDerived>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived>& other) {
    using AssignOp = internal::assign_op<Scalar, typename OtherDerived::Scalar>;
    internal::call_assignment(*this, other.derived(), AssignOp());
    return m_xpr;
  }
  template <typename OtherDerived>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const EigenBase<OtherDerived>& other) {
    using AddAssignOp = internal::add_assign_op<Scalar, typename OtherDerived::Scalar>;
    internal::call_assignment(*this, other.derived(), AddAssignOp());
    return m_xpr;
  }
  template <typename OtherDerived>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const EigenBase<OtherDerived>& other) {
    using SubAssignOp = internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>;
    internal::call_assignment(*this, other.derived(), SubAssignOp());
    return m_xpr;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return m_xpr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Device& device() { return m_device; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NoAlias<DeviceWrapper, EigenBase> noalias() {
    return NoAlias<DeviceWrapper, EigenBase>(*this);
  }

  Derived& m_xpr;
  Device& m_device;
};

namespace internal {

// this is where we differentiate between lazy assignment and specialized kernels (e.g. matrix products)
template <typename DstXprType, typename SrcXprType, typename Functor, typename Device,
          typename Kind = typename AssignmentKind<typename evaluator_traits<DstXprType>::Shape,
                                                  typename evaluator_traits<SrcXprType>::Shape>::Kind,
          typename EnableIf = void>
struct AssignmentWithDevice;

// unless otherwise specified, use the default product implementation
template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Functor, typename Device,
          typename Weak>
struct AssignmentWithDevice<DstXprType, Product<Lhs, Rhs, Options>, Functor, Device, Dense2Dense, Weak> {
  using SrcXprType = Product<Lhs, Rhs, Options>;
  using Base = Assignment<DstXprType, SrcXprType, Functor>;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
                                                        Device&) {
    Base::run(dst, src, func);
  }
};

// specialization for coefficient-wise assignment
template <typename DstXprType, typename SrcXprType, typename Functor, typename Device, typename Weak>
struct AssignmentWithDevice<DstXprType, SrcXprType, Functor, Device, Dense2Dense, Weak> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
                                                        Device& device) {
#ifndef EIGEN_NO_DEBUG
    internal::check_for_aliasing(dst, src);
#endif

    call_dense_assignment_loop(dst, src, func, device);
  }
};

// this allows us to use the default evaluation scheme if it is not specialized for the device
template <typename Kernel, typename Device, int Traversal = Kernel::AssignmentTraits::Traversal,
          int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop_with_device {
  using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Device&) { Base::run(kernel); }
};

// entry point for a generic expression with device
template <typename Dst, typename Src, typename Func, typename Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
                                                                                    const Src& src, const Func& func) {
  enum {
    NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
                       (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
                      int(Dst::SizeAtCompileTime) != 1
  };

  using ActualDstTypeCleaned = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst>;
  using ActualDstType = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&>;
  ActualDstType actualDst(dst.derived());

  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);

  // this provides a mechanism for specializing simple assignments, matrix products, etc
  AssignmentWithDevice<ActualDstTypeCleaned, Src, Func, Device>::run(actualDst, src, func, dst.device());
}

// copy and pasted from AssignEvaluator except forward device to kernel
template <typename DstXprType, typename SrcXprType, typename Functor, typename Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
                                                                                      const SrcXprType& src,
                                                                                      const Functor& func,
                                                                                      Device& device) {
  using DstEvaluatorType = evaluator<DstXprType>;
  using SrcEvaluatorType = evaluator<SrcXprType>;

  SrcEvaluatorType srcEvaluator(src);

  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
  // we need to resize the destination after the source evaluator has been created.
  resize_if_allowed(dst, src, func);

  DstEvaluatorType dstEvaluator(dst);

  using Kernel = generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor>;

  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());

  dense_assignment_loop_with_device<Kernel, Device>::run(kernel, device);
}

}  // namespace internal

template <typename Derived>
template <typename Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> EigenBase<Derived>::device(Device& device) {
  return DeviceWrapper<Derived, Device>(derived(), device);
}

template <typename Derived>
template <typename Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> EigenBase<Derived>::device(
    Device& device) const {
  return DeviceWrapper<const Derived, Device>(derived(), device);
}
}  // namespace Eigen

#endif
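For orientation, a minimal usage sketch of the entry point this file adds. This sketch is not part of the commit; the thread count and matrix sizes are illustrative, and CoreThreadPoolDevice comes from the new ThreadPool header added below:

#define EIGEN_USE_THREADS
#include <Eigen/Core>
#include <Eigen/ThreadPool>

int main() {
  Eigen::ThreadPool pool(4);                 // 4 worker threads (illustrative)
  Eigen::CoreThreadPoolDevice device(pool);  // adapter consumed by DeviceWrapper
  Eigen::MatrixXf a(512, 512), b(512, 512), c(512, 512);
  a.setRandom();
  b.setRandom();
  // Routed through DeviceWrapper::operator= and from there into the
  // device-aware assignment loops instead of the serial ones.
  c.device(device) = a + b;
  return 0;
}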
@@ -104,6 +104,11 @@ struct EigenBase {
     // derived class can reimplement it in a more optimized way.
     dst = this->derived() * dst;
   }
 
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> device(Device& device);
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> device(Device& device) const;
+
 };
 
 /***************************************************************************
@@ -1249,20 +1249,40 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double trunc(const double& x) {
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   eigen_assert(a >= 0);
   eigen_assert(b > 0);
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedT ub = UnsignedT(b);
   // Note: This form is used because it cannot overflow.
-  return a == 0 ? 0 : (a - 1) / b + 1;
+  return ua == 0 ? 0 : (ua - 1) / ub + 1;
+}
+
+// Integer round down to nearest multiple of b
+// T is assumed to be an integer type with a>=0, and b>0
+template <typename T, typename U>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T round_down(T a, U b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
+  using UnsignedU = typename internal::make_unsigned<U>::type;
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  EIGEN_STATIC_ASSERT((NumTraits<U>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  eigen_assert(a >= 0);
+  eigen_assert(b > 0);
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedU ub = UnsignedU(b);
+  return ub * (ua / ub);
 }
 
 /** Log base 2 for 32 bits positive integers.
  * Conveniently returns 0 for x==0. */
-inline int log2(int x) {
+EIGEN_CONSTEXPR inline int log2(int x) {
   eigen_assert(x >= 0);
   unsigned int v(x);
-  static const int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
-                                8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+  constexpr int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+                             8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
   v |= v >> 1;
   v |= v >> 2;
   v |= v >> 4;
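A quick arithmetic check of the two forms with illustrative values (a = 7, b = 3); this snippet is not part of the commit:

#include <cassert>

int main() {
  unsigned a = 7, b = 3;
  // div_ceil form: (a - 1) / b + 1 never overflows, unlike (a + b - 1) / b
  assert((a == 0 ? 0 : (a - 1) / b + 1) == 3);  // ceil(7 / 3) == 3
  // round_down form: largest multiple of b not exceeding a
  assert(b * (a / b) == 6);                     // 3 * (7 / 3) == 3 * 2 == 6
  return 0;
}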
@@ -502,6 +502,9 @@ struct stem_function {
 };
 }  // namespace internal
 
+template <typename XprType, typename Device>
+struct DeviceWrapper;
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_FORWARDDECLARATIONS_H
@@ -206,6 +206,64 @@ struct functor_traits {
   enum { Cost = 10, PacketAccess = false, IsRepeatable = false };
 };
 
+// estimates the cost of lazily evaluating a generic functor by unwinding the expression
+template <typename Xpr>
+struct nested_functor_cost {
+  static constexpr Index Cost = static_cast<Index>(functor_traits<Xpr>::Cost);
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+// TODO: assign a cost to the stride type?
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct nested_functor_cost<Map<PlainObjectType, MapOptions, StrideType>> : nested_functor_cost<PlainObjectType> {};
+
+template <typename Func, typename Xpr>
+struct nested_functor_cost<CwiseUnaryOp<Func, Xpr>> {
+  using XprCleaned = remove_all_t<Xpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename Xpr>
+struct nested_functor_cost<CwiseNullaryOp<Func, Xpr>> {
+  using XprCleaned = remove_all_t<Xpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename LhsXpr, typename RhsXpr>
+struct nested_functor_cost<CwiseBinaryOp<Func, LhsXpr, RhsXpr>> {
+  using LhsXprCleaned = remove_all_t<LhsXpr>;
+  using RhsXprCleaned = remove_all_t<RhsXpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Func, typename LhsXpr, typename MidXpr, typename RhsXpr>
+struct nested_functor_cost<CwiseTernaryOp<Func, LhsXpr, MidXpr, RhsXpr>> {
+  using LhsXprCleaned = remove_all_t<LhsXpr>;
+  using MidXprCleaned = remove_all_t<MidXpr>;
+  using RhsXprCleaned = remove_all_t<RhsXpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<MidXprCleaned>::Cost + nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Xpr>
+struct functor_cost {
+  static constexpr Index Cost = plain_enum_max(nested_functor_cost<Xpr>::Cost, 1);
+};
+
 template <typename T>
 struct packet_traits;
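To see how the recursion unwinds, consider a.cwiseAbs2() + b for float matrices. The numbers below assume an abs2/sum functor cost of 1 each; they are illustrative, not normative:

// Src = CwiseBinaryOp<scalar_sum_op<float, float>,
//                     CwiseUnaryOp<scalar_abs2_op<float>, MatrixXf>, MatrixXf>
// nested_functor_cost<Src>::Cost
//   = Cost(scalar_sum_op)                      // 1
//   + Cost(CwiseUnaryOp<scalar_abs2_op, ...>)  // 1 (abs2) + 1 (Matrix leaf)
//   + Cost(MatrixXf)                           // 1 (plain-object leaf)
//   = 4
// functor_cost<Src> clamps the total to at least 1; the device later scales it
// by the number of operations to decide how many threads are worth waking.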
Eigen/src/ThreadPool/CoreThreadPoolDevice.h (new file, 327 lines)
@@ -0,0 +1,327 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
#define EIGEN_CORE_THREAD_POOL_DEVICE_H

namespace Eigen {

// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
// ThreadPool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
// delegating the task to a thread.
//
//                               a
//                         /           \
//                        /             \
//                       /               \
//                      /                 \
//                     /                   \
//                    /                     \
//                   /                       \
//                  a                         e
//                /   \                     /   \
//               /     \                   /     \
//              /       \                 /       \
//             a         c               e         g
//            / \       / \             / \       / \
//           a   b     c   d           e   f     g   h
//
// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the
// number of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The
// primary task 'a' creates tasks 'e', 'c' and 'b', and executes its portion of the expression at the bottom of the
// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.

struct CoreThreadPoolDevice {
  using Task = std::function<void()>;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
      : m_pool(pool) {
    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
    m_costFactor = threadCostThreshold;
  }

  template <int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
    eigen_assert(cost >= 0.0f && "cost must be non-negative");
    Index numOps = size / PacketSize;
    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
    float totalCost = static_cast<float>(numOps) * cost;
    float idealThreads = totalCost * m_costFactor;
    if (idealThreads < static_cast<float>(actualThreads)) {
      idealThreads = numext::maxi(idealThreads, 1.0f);
      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
    }
    int maxLevel = internal::log2_ceil(actualThreads);
    return maxLevel;
  }

// MSVC does not like inlining parallelForImpl
#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
#define EIGEN_PARALLEL_FOR_INLINE
#else
#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
#endif

  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
                                                                   Barrier& barrier, int level) {
    while (level > 0) {
      level--;
      Index size = end - begin;
      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
      Index mid = begin + numext::round_down(size >> 1, PacketSize);
      Task right = [=, this, &f, &barrier]() {
        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
      };
      m_pool.Schedule(std::move(right));
      end = mid;
    }
    for (Index i = begin; i < end; i += PacketSize) f(i);
    barrier.Notify();
  }

  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
                                                                   int level) {
    while (level > 0) {
      level--;
      Index outerSize = outerEnd - outerBegin;
      if (outerSize > 1) {
        Index outerMid = outerBegin + (outerSize >> 1);
        Task right = [=, this, &f, &barrier]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        outerEnd = outerMid;
      } else {
        Index innerSize = innerEnd - innerBegin;
        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
        Task right = [=, this, &f, &barrier]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        innerEnd = innerMid;
      }
    }
    for (Index outer = outerBegin; outer < outerEnd; outer++)
      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
    barrier.Notify();
  }

#undef EIGEN_PARALLEL_FOR_INLINE

  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
    Index size = end - begin;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
    barrier.Wait();
  }

  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
                                                         Index innerEnd, BinaryFunctor& f, float cost) {
    Index outerSize = outerEnd - outerBegin;
    Index innerSize = innerEnd - innerBegin;
    Index size = outerSize * innerSize;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
    barrier.Wait();
  }

  ThreadPool& m_pool;
  // costFactor is the cost of delegating a task to a thread
  // the inverse is used to avoid a floating point division
  float m_costFactor;
};

// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice

namespace internal {

template <typename Kernel>
struct cost_helper {
  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
  using SrcXprType = typename SrcEvaluatorType::XprType;
  using DstXprType = typename DstEvaluatorType::XprType;
  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->assignCoeffByOuterInner(outer, inner);
    }
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
  }
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    AssignmentFunctor functor(kernel);
    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
  }
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
                         DstAlignment = Kernel::AssignmentTraits::DstAlignment;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
  }
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
  using PacketType = typename Kernel::PacketType;
  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
                         DstAlignment = Kernel::AssignmentTraits::DstAlignment,
                         InnerSize = DstXprType::InnerSizeAtCompileTime;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
  }
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
  using Scalar = typename Kernel::Scalar;
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
  struct PacketAssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
    }
  };
  struct ScalarAssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      const Index innerSize = this->innerSize();
      const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
      for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    const Index innerSize = kernel.innerSize();
    const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
    constexpr float packetCost = static_cast<float>(XprEvaluationCost);
    const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
    PacketAssignmentFunctor packetFunctor(kernel);
    ScalarAssignmentFunctor scalarFunctor(kernel);
    device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
                                                                     packetCost);
    device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
  }
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index size = kernel.size();
    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);
  }
};

template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
  using Scalar = typename Kernel::Scalar;
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
                         RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
                         PacketSize = unpacket_traits<PacketType>::size,
                         DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
                         DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
                                                                               : Kernel::AssignmentTraits::DstAlignment,
                         SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index size = kernel.size();
    const Index alignedStart =
        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);

    unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);

    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);

    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
  }
};

}  // namespace internal

}  // namespace Eigen

#endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H
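To make the divide-and-delegate recursion above concrete, a standalone sketch, not part of the commit: the pool handoff is replaced by a direct recursive call, which preserves the leaf ranges the real implementation would execute, and level = 2 yields 2^2 tasks:

#include <cstdio>

// Mirrors parallelForImpl's loop: keep the left half, hand the right half off
// (here a plain recursive call stands in for ThreadPool::Schedule).
void splitRange(int begin, int end, int level) {
  while (level > 0) {
    level--;
    int mid = begin + ((end - begin) >> 1);  // round_down by PacketSize omitted
    splitRange(mid, end, level);             // the "right" task
    end = mid;                               // continue descending to the left
  }
  std::printf("leaf range [%d, %d)\n", begin, end);  // one task's share
}

int main() {
  splitRange(0, 16, 2);  // prints 4 leaf ranges of 4 elements each
  return 0;
}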
@@ -343,7 +343,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
       }
       victim += inc;
       if (victim >= size) {
-        victim -= size;
+        victim -= static_cast<unsigned int>(size);
       }
     }
     return Task();
@@ -431,7 +431,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
       }
       victim += inc;
       if (victim >= size) {
-        victim -= size;
+        victim -= static_cast<unsigned int>(size);
       }
     }
     return -1;
test/assignment_threaded.cpp (new file, 87 lines)
@@ -0,0 +1,87 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_THREADS)
#undef EIGEN_USE_THREADS
#endif

#define EIGEN_USE_THREADS

#include "main.h"

namespace Eigen {
namespace internal {
// conveniently control vectorization logic
template <typename Scalar, bool Vectorize>
struct scalar_dummy_op {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return a; }
  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
    return a;
  }
};
template <typename Scalar, bool Vectorize>
struct functor_traits<scalar_dummy_op<Scalar, Vectorize> > {
  enum { Cost = 1'000'000, PacketAccess = Vectorize && packet_traits<Scalar>::Vectorizable };
};
}  // namespace internal
}  // namespace Eigen

template <typename PlainObject>
void test_threaded_assignment(const PlainObject&, Index rows = PlainObject::RowsAtCompileTime,
                              Index cols = PlainObject::ColsAtCompileTime) {
  using Scalar = typename PlainObject::Scalar;
  using VectorizationOff = internal::scalar_dummy_op<Scalar, false>;
  using VectorizationOn = internal::scalar_dummy_op<Scalar, true>;

  int threads = 4;
  ThreadPool pool(threads);
  CoreThreadPoolDevice threadPoolDevice(pool);

  PlainObject dst(rows, cols), ref(rows, cols), rhs(rows, cols);
  rhs.setRandom();
  const auto rhs_xpr = rhs.cwiseAbs2();

  // linear access
  dst.setRandom();
  ref.setRandom();
  ref = rhs_xpr.unaryExpr(VectorizationOff());
  dst.device(threadPoolDevice) = rhs_xpr.unaryExpr(VectorizationOff());
  VERIFY_IS_CWISE_EQUAL(ref, dst);

  ref = rhs_xpr.unaryExpr(VectorizationOn());
  dst.device(threadPoolDevice) = rhs_xpr.unaryExpr(VectorizationOn());
  VERIFY_IS_CWISE_EQUAL(ref, dst);

  // outer-inner access
  Index blockRows = numext::maxi(Index(1), rows - 1);
  Index blockCols = numext::maxi(Index(1), cols - 1);
  dst.setRandom();
  ref.setRandom();
  ref.bottomRightCorner(blockRows, blockCols) =
      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOff());
  dst.bottomRightCorner(blockRows, blockCols).device(threadPoolDevice) =
      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOff());
  VERIFY_IS_CWISE_EQUAL(ref.bottomRightCorner(blockRows, blockCols), dst.bottomRightCorner(blockRows, blockCols));

  ref.setZero();
  dst.setZero();
  ref.bottomRightCorner(blockRows, blockCols) =
      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOn());
  dst.bottomRightCorner(blockRows, blockCols).device(threadPoolDevice) =
      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOn());
  VERIFY_IS_CWISE_EQUAL(ref.bottomRightCorner(blockRows, blockCols), dst.bottomRightCorner(blockRows, blockCols));
}

EIGEN_DECLARE_TEST(test) {
  for (int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST(test_threaded_assignment(MatrixXd(), 123, 123));
    CALL_SUBTEST(test_threaded_assignment(Matrix<float, 16, 16>()));
  }
}
@@ -53,7 +53,7 @@
 #include <random>
 #include <chrono>
 #ifdef EIGEN_USE_THREADS
-#include <future>
+#include <Eigen/ThreadPool>
 #endif
 #endif
 #if __cplusplus > 201703L