diff --git a/Eigen/Core b/Eigen/Core
index ed7d3538f..09d7d000f 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -404,6 +404,7 @@ using std::ptrdiff_t;
 #endif
 
 #include "src/Core/GlobalFunctions.h"
+#include "src/Core/DeviceWrapper.h"
 
 // IWYU pragma: end_exports
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/ThreadPool b/Eigen/ThreadPool
index febb1871d..c6e2cee5b 100644
--- a/Eigen/ThreadPool
+++ b/Eigen/ThreadPool
@@ -71,6 +71,7 @@
 #include "src/ThreadPool/ThreadEnvironment.h"
 #include "src/ThreadPool/Barrier.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
+#include "src/ThreadPool/CoreThreadPoolDevice.h"
 
 // IWYU pragma: end_exports
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
new file mode 100644
index 000000000..9fdbe6079
--- /dev/null
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -0,0 +1,155 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DEVICEWRAPPER_H
+#define EIGEN_DEVICEWRAPPER_H
+
+namespace Eigen {
+
+template <typename Derived, typename Device>
+struct DeviceWrapper {
+  using Base = EigenBase<internal::remove_all_t<Derived>>;
+  using Scalar = typename Derived::Scalar;
+
+  EIGEN_DEVICE_FUNC DeviceWrapper(Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
+  EIGEN_DEVICE_FUNC DeviceWrapper(const Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived>& other) {
+    using AssignOp = internal::assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), AssignOp());
+    return m_xpr;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const EigenBase<OtherDerived>& other) {
+    using AddAssignOp = internal::add_assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), AddAssignOp());
+    return m_xpr;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const EigenBase<OtherDerived>& other) {
+    using SubAssignOp = internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), SubAssignOp());
+    return m_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return m_xpr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Device& device() { return m_device; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NoAlias<DeviceWrapper, EigenBase> noalias() {
+    return NoAlias<DeviceWrapper, EigenBase>(*this);
+  }
+
+  Derived& m_xpr;
+  Device& m_device;
+};
+
+namespace internal {
+
+// this is where we differentiate between lazy assignment and specialized kernels (e.g. matrix products)
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device,
+          typename Kind = typename AssignmentKind<typename evaluator_traits<DstXprType>::Shape,
+                                                  typename evaluator_traits<SrcXprType>::Shape>::Kind,
+          typename EnableIf = void>
+struct AssignmentWithDevice;
+
+// unless otherwise specified, use the default product implementation
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Functor, typename Device,
+          typename Weak>
+struct AssignmentWithDevice<DstXprType, Product<Lhs, Rhs, Options>, Functor, Device, Dense2Dense, Weak> {
+  using SrcXprType = Product<Lhs, Rhs, Options>;
+  using Base = Assignment<DstXprType, SrcXprType, Functor>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
+                                                        Device&) {
+    Base::run(dst, src, func);
+  };
+};
+
+// specialization for coefficient-wise assignment
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device, typename Weak>
+struct AssignmentWithDevice<DstXprType, SrcXprType, Functor, Device, Dense2Dense, Weak> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
+                                                        Device& device) {
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
+#endif
+
+    call_dense_assignment_loop(dst, src, func, device);
+  }
+};
+
+// this allows us to use the default evaluation scheme if it is not specialized for the device
+template <typename Kernel, typename Device, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop_with_device {
+  using Base = dense_assignment_loop<Kernel>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Device&) { Base::run(kernel); }
+};
+
+// entry point for a generic expression with device
+template <typename Dst, typename Device, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
+                                                                                    const Src& src, const Func& func) {
+  enum {
+    NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
+                       (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
+                      int(Dst::SizeAtCompileTime) != 1
+  };
+
+  using ActualDstTypeCleaned = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst>;
+  using ActualDstType = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&>;
+  ActualDstType actualDst(dst.derived());
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);
+
+  // this provides a mechanism for specializing simple assignments, matrix products, etc
+  AssignmentWithDevice<ActualDstTypeCleaned, Src, Func, Device>::run(actualDst, src, func, dst.device());
+}
+
+// copied from AssignEvaluator, except that the device is forwarded to the kernel
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
+                                                                                      const SrcXprType& src,
+                                                                                      const Functor& func,
+                                                                                      Device& device) {
+  using DstEvaluatorType = evaluator<DstXprType>;
+  using SrcEvaluatorType = evaluator<SrcXprType>;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+
+  using Kernel = generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor>;
+
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop_with_device<Kernel, Device>::run(kernel, device);
+}
+
+}  // namespace internal
+
+template <typename Derived>
+template <typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> EigenBase<Derived>::device(Device& device) {
+  return DeviceWrapper<Derived, Device>(derived(), device);
+}
+
+template <typename Derived>
+template <typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> EigenBase<Derived>::device(
+    Device& device) const {
+  return DeviceWrapper<const Derived, Device>(derived(), device);
+}
+
+}  // namespace Eigen
+#endif
\ No newline at end of file
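Note: for orientation before the remaining hunks, this is how the new entry point is meant to be used. A minimal sketch, not part of the patch; the pool size, matrix sizes, and variable names are illustrative:

  #include <Eigen/Core>
  #include <Eigen/ThreadPool>

  int main() {
    Eigen::ThreadPool pool(4);                 // worker threads
    Eigen::CoreThreadPoolDevice device(pool);  // added by this patch (CoreThreadPoolDevice.h below)
    Eigen::MatrixXf a = Eigen::MatrixXf::Random(2048, 2048);
    Eigen::MatrixXf b = Eigen::MatrixXf::Random(2048, 2048);
    Eigen::MatrixXf c(2048, 2048);
    c.device(device) = a + b;            // coefficient-wise loop is split across the pool
    c.device(device).noalias() = a * b;  // products fall through to the default Assignment above
    return 0;
  }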
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index f485016ac..a2d6ee2ec 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -104,6 +104,11 @@ struct EigenBase {
     // derived class can reimplement it in a more optimized way.
     dst = this->derived() * dst;
   }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> device(Device& device);
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> device(Device& device) const;
 };
 
 /***************************************************************************
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index d42fc93cc..feb9099b4 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1249,20 +1249,40 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double trunc(const double& x) {
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   eigen_assert(a >= 0);
   eigen_assert(b > 0);
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedT ub = UnsignedT(b);
   // Note: This form is used because it cannot overflow.
-  return a == 0 ? 0 : (a - 1) / b + 1;
+  return ua == 0 ? 0 : (ua - 1) / ub + 1;
+}
+
+// Integer round down to nearest multiple of b
+// T and U are assumed to be integer types with a>=0, and b>0
+template <typename T, typename U>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T round_down(T a, U b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
+  using UnsignedU = typename internal::make_unsigned<U>::type;
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  EIGEN_STATIC_ASSERT((NumTraits<U>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  eigen_assert(a >= 0);
+  eigen_assert(b > 0);
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedU ub = UnsignedU(b);
+  return ub * (ua / ub);
 }
 
 /** Log base 2 for 32 bits positive integers.
   * Conveniently returns 0 for x==0. */
-inline int log2(int x) {
+EIGEN_CONSTEXPR inline int log2(int x) {
   eigen_assert(x >= 0);
   unsigned int v(x);
-  static const int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
-                                8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+  constexpr int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+                             8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
   v |= v >> 1;
   v |= v >> 2;
   v |= v >> 4;
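Note: the intended semantics of the two integer helpers above, sketched with illustrative values (assuming assertions compile away, e.g. under NDEBUG, so the calls are valid in constant expressions):

  static_assert(Eigen::numext::div_ceil(7, 3) == 3, "7/3 rounded up");
  static_assert(Eigen::numext::div_ceil(0, 3) == 0, "a == 0 is short-circuited, so (a - 1) never underflows");
  static_assert(Eigen::numext::round_down(7, 3) == 6, "largest multiple of 3 that does not exceed 7");
  static_assert(Eigen::numext::round_down(12, 4) == 12, "multiples of b are fixed points");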
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 2f2ba9b20..cf2535928 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -502,6 +502,9 @@ struct stem_function {
 };
 }  // namespace internal
 
+template <typename Derived, typename Device>
+struct DeviceWrapper;
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_FORWARDDECLARATIONS_H
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index a6a7d3fbb..cecbee865 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -206,6 +206,64 @@ struct functor_traits {
   enum { Cost = 10, PacketAccess = false, IsRepeatable = false };
 };
 
+// estimates the cost of lazily evaluating a generic functor by unwinding the expression
+template <typename Xpr>
+struct nested_functor_cost {
+  static constexpr Index Cost = static_cast<Index>(functor_traits<Xpr>::Cost);
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+// TODO: assign a cost to the stride type?
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct nested_functor_cost<Map<PlainObjectType, MapOptions, StrideType>> : nested_functor_cost<PlainObjectType> {};
+
+template <typename Func, typename XprType>
+struct nested_functor_cost<CwiseNullaryOp<Func, XprType>> {
+  using XprCleaned = remove_all_t<XprType>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename XprType>
+struct nested_functor_cost<CwiseUnaryOp<Func, XprType>> {
+  using XprCleaned = remove_all_t<XprType>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename LhsXprType, typename RhsXprType>
+struct nested_functor_cost<CwiseBinaryOp<Func, LhsXprType, RhsXprType>> {
+  using LhsXprCleaned = remove_all_t<LhsXprType>;
+  using RhsXprCleaned = remove_all_t<RhsXprType>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Func, typename LhsXprType, typename MidXprType, typename RhsXprType>
+struct nested_functor_cost<CwiseTernaryOp<Func, LhsXprType, MidXprType, RhsXprType>> {
+  using LhsXprCleaned = remove_all_t<LhsXprType>;
+  using MidXprCleaned = remove_all_t<MidXprType>;
+  using RhsXprCleaned = remove_all_t<RhsXprType>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<MidXprCleaned>::Cost + nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Xpr>
+struct functor_cost {
+  static constexpr Index Cost = plain_enum_max(nested_functor_cost<Xpr>::Cost, 1);
+};
+
 template <typename T>
 struct packet_traits;
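Note: a sketch of how this cost estimate unwinds an expression tree (the expression type is illustrative; actual functor costs come from functor_traits and may differ per scalar type):

  #include <utility>
  #include <Eigen/Core>

  // rhs.cwiseAbs2() is a CwiseUnaryOp<scalar_abs2_op<float>, const MatrixXf>, so the estimate is
  //   nested_functor_cost<scalar_abs2_op<float>>::Cost   (from functor_traits)
  // + nested_functor_cost<MatrixXf>::Cost                (a plain Matrix costs 1)
  // and functor_cost clamps the sum to at least 1:
  using Xpr = Eigen::internal::remove_all_t<decltype(std::declval<const Eigen::MatrixXf&>().cwiseAbs2())>;
  constexpr Eigen::Index XprCost = Eigen::internal::functor_cost<Xpr>::Cost;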
diff --git a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
new file mode 100644
index 000000000..5611a22bd
--- /dev/null
+++ b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
+#define EIGEN_CORE_THREAD_POOL_DEVICE_H
+
+namespace Eigen {
+
+// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
+// ThreadPool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
+// delegating the task to a thread.
+//
+//               a
+//              / \
+//             /   \
+//            /     \
+//           /       \
+//          /         \
+//         /           \
+//        /             \
+//       a               e
+//      / \             / \
+//     /   \           /   \
+//    /     \         /     \
+//   a       c       e       g
+//  / \     / \     / \     / \
+// a   b   c   d   e   f   g   h
+//
+// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
+// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the
+// number of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The primary
+// task 'a' creates tasks 'e', 'c', and 'b', and executes its portion of the expression at the bottom of the tree.
+// Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.
+
+struct CoreThreadPoolDevice {
+  using Task = std::function<void()>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
+      : m_pool(pool) {
+    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
+    m_costFactor = threadCostThreshold;
+  }
+
+  template <int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
+    eigen_assert(cost >= 0.0f && "cost must be non-negative");
+    Index numOps = size / PacketSize;
+    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
+    float totalCost = static_cast<float>(numOps) * cost;
+    float idealThreads = totalCost * m_costFactor;
+    if (idealThreads < static_cast<float>(actualThreads)) {
+      idealThreads = numext::maxi(idealThreads, 1.0f);
+      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
+    }
+    int maxLevel = internal::log2_ceil(actualThreads);
+    return maxLevel;
+  }
+
+// MSVC does not like inlining parallelForImpl
+#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
+#define EIGEN_PARALLEL_FOR_INLINE
+#else
+#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
+#endif
+
+  template <typename UnaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
+                                                                   Barrier& barrier, int level) {
+    while (level > 0) {
+      level--;
+      Index size = end - begin;
+      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
+      Index mid = begin + numext::round_down(size >> 1, PacketSize);
+      Task right = [=, this, &f, &barrier]() {
+        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
+      };
+      m_pool.Schedule(std::move(right));
+      end = mid;
+    }
+    for (Index i = begin; i < end; i += PacketSize) f(i);
+    barrier.Notify();
+  }
+
+  template <typename BinaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
+                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
+                                                                   int level) {
+    while (level > 0) {
+      level--;
+      Index outerSize = outerEnd - outerBegin;
+      if (outerSize > 1) {
+        Index outerMid = outerBegin + (outerSize >> 1);
+        Task right = [=, this, &f, &barrier]() {
+          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
+        };
+        m_pool.Schedule(std::move(right));
+        outerEnd = outerMid;
+      } else {
+        Index innerSize = innerEnd - innerBegin;
+        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
+        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
+        Task right = [=, this, &f, &barrier]() {
+          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
+        };
+        m_pool.Schedule(std::move(right));
+        innerEnd = innerMid;
+      }
+    }
+    for (Index outer = outerBegin; outer < outerEnd; outer++)
+      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
+    barrier.Notify();
+  }
+
+#undef EIGEN_PARALLEL_FOR_INLINE
+
+  template <typename UnaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
+    Index size = end - begin;
+    int maxLevel = calculateLevels<PacketSize>(size, cost);
+    Barrier barrier(1 << maxLevel);
+    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
+    barrier.Wait();
+  }
+
+  template <typename BinaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
+                                                         Index innerEnd, BinaryFunctor& f, float cost) {
+    Index outerSize = outerEnd - outerBegin;
+    Index innerSize = innerEnd - innerBegin;
+    Index size = outerSize * innerSize;
+    int maxLevel = calculateLevels<PacketSize>(size, cost);
+    Barrier barrier(1 << maxLevel);
+    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
+    barrier.Wait();
+  }
+
+  ThreadPool& m_pool;
+  // m_costFactor is the inverse of the cost of delegating a task to a thread;
+  // the inverse is stored to avoid a floating-point division
+  float m_costFactor;
+};
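Note: a worked example of the splitting heuristic above, with hypothetical numbers. For size = 4096 floats, PacketSize = 8, cost = 10, and a 16-thread pool: numOps = 512 and totalCost = 5120, so idealThreads = 5120 * 3e-5 ≈ 0.15; actualThreads clamps to 1 and maxLevel = log2_ceil(1) = 0, i.e. the expression is too cheap to split and runs on the calling thread. With cost = 1e4 instead, idealThreads ≈ 153.6 exceeds the pool size, so maxLevel = log2_ceil(16) = 4 and the tree above bottoms out in 16 leaf tasks. parallelFor can also be exercised directly; a hypothetical sketch (the assignment specializations below normally drive it):

  #include <cstddef>
  #include <vector>
  #include <Eigen/Core>
  #include <Eigen/ThreadPool>

  void fill_ones(Eigen::CoreThreadPoolDevice& device, std::vector<float>& out) {
    auto f = [&out](Eigen::Index i) { out[static_cast<std::size_t>(i)] = 1.0f; };
    // PacketSize == 1: f is invoked once per index in [0, out.size())
    device.parallelFor<decltype(f), 1>(0, static_cast<Eigen::Index>(out.size()), f, /*cost=*/1.0f);
  }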
+
+// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice
+
+namespace internal {
+
+template <typename Kernel>
+struct cost_helper {
+  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
+  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
+  using SrcXprType = typename SrcEvaluatorType::XprType;
+  using DstXprType = typename DstEvaluatorType::XprType;
+  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->assignCoeffByOuterInner(outer, inner);
+    }
+  };
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
+  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    AssignmentFunctor functor(kernel);
+    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
+    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
+  }
+};
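Note: conceptually, the DefaultTraversal/NoUnrolling specialization above amounts to the following double loop, with the outer range (and, for expensive enough expressions, the inner range) partitioned across the pool; for a column-major destination, outer is the column index and inner the row index:

  for (Eigen::Index outer = 0; outer < kernel.outerSize(); outer++)    // split across tasks
    for (Eigen::Index inner = 0; inner < kernel.innerSize(); inner++)  // PacketSize == 1: one scalar store each
      kernel.assignCoeffByOuterInner(outer, inner);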
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
+                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+                         DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
+                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+                         DstAlignment = Kernel::AssignmentTraits::DstAlignment,
+                         InnerSize = DstXprType::InnerSizeAtCompileTime;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this,
+                                                                                                          outer);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
+  struct PacketAssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
+    }
+  };
+  struct ScalarAssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      const Index innerSize = this->innerSize();
+      const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
+      for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    const Index innerSize = kernel.innerSize();
+    const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
+    constexpr float packetCost = static_cast<float>(XprEvaluationCost);
+    const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
+    PacketAssignmentFunctor packetFunctor(kernel);
+    ScalarAssignmentFunctor scalarFunctor(kernel);
+    device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
+                                                                     packetCost);
+    device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
+  };
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index size = kernel.size();
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
+                         RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
+                         PacketSize = unpacket_traits<PacketType>::size,
+                         DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
+                         DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
+                                                                               : Kernel::AssignmentTraits::DstAlignment,
+                         SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
+      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index size = kernel.size();
+    const Index alignedStart =
+        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
+
+    unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);
+
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
+
+    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+  }
+};
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H
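Note: in the LinearVectorizedTraversal specialization above, only the aligned middle of the range is parallelized. With illustrative numbers: for size = 1003 floats, PacketSize = 8, and alignedStart = 5, alignedEnd = 5 + round_down(1003 - 5, 8) = 997; the head [0, 5) and tail [997, 1003) are handled sequentially on the calling thread by unaligned_dense_assignment_loop, while [5, 997) is processed as 124 packets distributed over the pool.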
diff --git a/Eigen/src/ThreadPool/NonBlockingThreadPool.h b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
index efa6ef571..66fd63c71 100644
--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -343,7 +343,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
       }
       victim += inc;
       if (victim >= size) {
-        victim -= size;
+        victim -= static_cast<unsigned>(size);
       }
     }
     return Task();
@@ -431,7 +431,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
       }
       victim += inc;
       if (victim >= size) {
-        victim -= size;
+        victim -= static_cast<unsigned>(size);
      }
     }
     return -1;
diff --git a/test/assignment_threaded.cpp b/test/assignment_threaded.cpp
new file mode 100644
index 000000000..0ba8c219a
--- /dev/null
+++ b/test/assignment_threaded.cpp
@@ -0,0 +1,87 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS)
+#undef EIGEN_USE_THREADS
+#endif
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+namespace Eigen {
+namespace internal {
+// conveniently control vectorization logic
+template <typename Scalar, bool Vectorize>
+struct scalar_dummy_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    return a;
+  }
+};
+template <typename Scalar, bool Vectorize>
+struct functor_traits<scalar_dummy_op<Scalar, Vectorize>> {
+  enum { Cost = 1'000'000, PacketAccess = Vectorize && packet_traits<Scalar>::Vectorizable };
+};
+}  // namespace internal
+}  // namespace Eigen
+
+template <typename PlainObject>
+void test_threaded_assignment(const PlainObject&, Index rows = PlainObject::RowsAtCompileTime,
+                              Index cols = PlainObject::ColsAtCompileTime) {
+  using Scalar = typename PlainObject::Scalar;
+  using VectorizationOff = internal::scalar_dummy_op<Scalar, false>;
+  using VectorizationOn = internal::scalar_dummy_op<Scalar, true>;
+
+  int threads = 4;
+  ThreadPool pool(threads);
+  CoreThreadPoolDevice threadPoolDevice(pool);
+
+  PlainObject dst(rows, cols), ref(rows, cols), rhs(rows, cols);
+  rhs.setRandom();
+  const auto rhs_xpr = rhs.cwiseAbs2();
+
+  // linear access
+  dst.setRandom();
+  ref.setRandom();
+  ref = rhs_xpr.unaryExpr(VectorizationOff());
+  dst.device(threadPoolDevice) = rhs_xpr.unaryExpr(VectorizationOff());
+  VERIFY_IS_CWISE_EQUAL(ref, dst);
+
+  ref = rhs_xpr.unaryExpr(VectorizationOn());
+  dst.device(threadPoolDevice) = rhs_xpr.unaryExpr(VectorizationOn());
+  VERIFY_IS_CWISE_EQUAL(ref, dst);
+
+  // outer-inner access
+  Index blockRows = numext::maxi(Index(1), rows - 1);
+  Index blockCols = numext::maxi(Index(1), cols - 1);
+  dst.setRandom();
+  ref.setRandom();
+  ref.bottomRightCorner(blockRows, blockCols) =
+      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOff());
+  dst.bottomRightCorner(blockRows, blockCols).device(threadPoolDevice) =
+      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOff());
+  VERIFY_IS_CWISE_EQUAL(ref.bottomRightCorner(blockRows, blockCols), dst.bottomRightCorner(blockRows, blockCols));
+
+  ref.setZero();
+  dst.setZero();
+  ref.bottomRightCorner(blockRows, blockCols) =
+      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOn());
+  dst.bottomRightCorner(blockRows, blockCols).device(threadPoolDevice) =
+      rhs_xpr.bottomRightCorner(blockRows, blockCols).unaryExpr(VectorizationOn());
+  VERIFY_IS_CWISE_EQUAL(ref.bottomRightCorner(blockRows, blockCols), dst.bottomRightCorner(blockRows, blockCols));
+}
+
+EIGEN_DECLARE_TEST(test) {
+  for (int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(test_threaded_assignment(MatrixXd(), 123, 123));
+    CALL_SUBTEST(test_threaded_assignment(Matrix<float, 16, 16>()));
+  }
+}
diff --git a/test/main.h b/test/main.h
index bce67368b..2e849f0fe 100644
--- a/test/main.h
+++ b/test/main.h
@@ -53,7 +53,7 @@
 #include <random>
 #include <chrono>
 #ifdef EIGEN_USE_THREADS
-#include <future>
+#include <Eigen/ThreadPool>
 #endif
 #endif
 #if __cplusplus > 201703L