From 5330960900f23d84ef998aac6f27a635eed31753 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Mon, 14 Apr 2025 17:44:53 +0000 Subject: [PATCH] Enable packet segment in partial redux --- Eigen/src/Core/AssignEvaluator.h | 3 +- Eigen/src/Core/PartialReduxEvaluator.h | 76 ++++++++++++++++++----- Eigen/src/Core/Redux.h | 7 +++ Eigen/src/Core/VectorwiseOp.h | 3 - Eigen/src/Core/util/ForwardDeclarations.h | 3 - Eigen/src/Core/util/XprHelper.h | 30 --------- 6 files changed, 68 insertions(+), 54 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index a33a21a2b..b4e8794a4 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -136,8 +136,7 @@ struct copy_using_evaluator_traits { : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling) #endif : NoUnrolling; - static constexpr bool UsePacketSegment = - enable_packet_segment::value && enable_packet_segment::value && has_packet_segment::value; + static constexpr bool UsePacketSegment = has_packet_segment::value; #ifdef EIGEN_DEBUG_ASSIGN static void debug() { diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h index 7b2c8dca3..1f638f9ac 100644 --- a/Eigen/src/Core/PartialReduxEvaluator.h +++ b/Eigen/src/Core/PartialReduxEvaluator.h @@ -103,19 +103,36 @@ struct packetwise_redux_impl { EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) { if (size == 0) return packetwise_redux_empty_value(func); - const Index size4 = (size - 1) & (~3); + const Index size4 = 1 + numext::round_down(size - 1, 4); PacketType p = eval.template packetByOuterInner(0, 0); - Index i = 1; // This loop is optimized for instruction pipelining: // - each iteration generates two independent instructions // - thanks to branch prediction and out-of-order execution we have independent instructions across loops - for (; i < size4; i += 4) + for (Index i = 1; i < size4; i += 4) p = func.packetOp( p, func.packetOp(func.packetOp(eval.template packetByOuterInner(i + 0, 0), eval.template packetByOuterInner(i + 1, 0)), func.packetOp(eval.template packetByOuterInner(i + 2, 0), eval.template packetByOuterInner(i + 3, 0)))); - for (; i < size; ++i) p = func.packetOp(p, eval.template packetByOuterInner(i, 0)); + for (Index i = size4; i < size; ++i) + p = func.packetOp(p, eval.template packetByOuterInner(i, 0)); + return p; + } +}; + +template +struct packetwise_segment_redux_impl { + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin, + Index count) { + if (size == 0) return packetwise_redux_empty_value(func); + + PacketType p = eval.template packetSegmentByOuterInner(0, 0, begin, count); + for (Index i = 1; i < size; ++i) + p = func.packetOp(p, eval.template packetSegmentByOuterInner(i, 0, begin, count)); return p; } }; @@ -174,14 +191,13 @@ struct evaluator > template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packet(Index idx) const { - enum { PacketSize = internal::unpacket_traits::size }; - typedef Block - PanelType; - - PanelType panel(m_arg, Direction == Vertical ? 0 : idx, Direction == Vertical ? idx : 0, - Direction == Vertical ? m_arg.rows() : Index(PacketSize), - Direction == Vertical ? Index(PacketSize) : m_arg.cols()); + static constexpr int PacketSize = internal::unpacket_traits::size; + static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : PacketSize; + static constexpr int PanelCols = Direction == Vertical ? PacketSize : ArgType::ColsAtCompileTime; + using PanelType = Block; + using PanelEvaluator = typename internal::redux_evaluator; + using BinaryOp = typename MemberOp::BinaryOp; + using Impl = internal::packetwise_redux_impl; // FIXME // See bug 1612, currently if PacketSize==1 (i.e. complex with 128bits registers) then the storage-order of @@ -189,11 +205,39 @@ struct evaluator > // by pass "vectorization" in this case: if (PacketSize == 1) return internal::pset1(coeff(idx)); - typedef typename internal::redux_evaluator PanelEvaluator; + Index startRow = Direction == Vertical ? 0 : idx; + Index startCol = Direction == Vertical ? idx : 0; + Index numRows = Direction == Vertical ? m_arg.rows() : PacketSize; + Index numCols = Direction == Vertical ? PacketSize : m_arg.cols(); + + PanelType panel(m_arg, startRow, startCol, numRows, numCols); PanelEvaluator panel_eval(panel); - typedef typename MemberOp::BinaryOp BinaryOp; - PacketType p = internal::packetwise_redux_impl::template run( - panel_eval, m_functor.binaryFunc(), m_arg.outerSize()); + PacketType p = Impl::template run(panel_eval, m_functor.binaryFunc(), m_arg.outerSize()); + return p; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index i, Index j, Index begin, Index count) const { + return packetSegment(Direction == Vertical ? j : i, begin, count); + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packetSegment(Index idx, Index begin, Index count) const { + static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : Dynamic; + static constexpr int PanelCols = Direction == Vertical ? Dynamic : ArgType::ColsAtCompileTime; + using PanelType = Block; + using PanelEvaluator = typename internal::redux_evaluator; + using BinaryOp = typename MemberOp::BinaryOp; + using Impl = internal::packetwise_segment_redux_impl; + + Index startRow = Direction == Vertical ? 0 : idx; + Index startCol = Direction == Vertical ? idx : 0; + Index numRows = Direction == Vertical ? m_arg.rows() : begin + count; + Index numCols = Direction == Vertical ? begin + count : m_arg.cols(); + + PanelType panel(m_arg, startRow, startCol, numRows, numCols); + PanelEvaluator panel_eval(panel); + PacketType p = Impl::template run(panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), begin, count); return p; } diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 0c5f2d9f6..4e9ab0e4f 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -414,6 +414,13 @@ class redux_evaluator : public internal::evaluator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentByOuterInner(Index outer, Index inner, Index begin, + Index count) const { + return Base::template packetSegment(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer, + begin, count); + } }; } // end namespace internal diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 1342478cd..b861b233c 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -37,9 +37,6 @@ class PartialReduxExpr; namespace internal { -template -struct enable_packet_segment> : std::false_type {}; - template struct traits > : traits { typedef typename MemberOp::result_type Scalar; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 8d1073c86..3c0bc461e 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -517,9 +517,6 @@ struct eigen_zero_impl; template struct has_packet_segment : std::false_type {}; - -template -struct enable_packet_segment : std::true_type {}; } // namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 24268bfab..a42bb0f73 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -996,36 +996,6 @@ struct is_matrix_base_xpr : std::is_base_of>, r template struct is_permutation_base_xpr : std::is_base_of>, remove_all_t> {}; -/*---------------- load/store segment support ----------------*/ - -// recursively traverse unary, binary, and ternary expressions to determine if packet segments are supported - -template -struct enable_packet_segment> : enable_packet_segment> {}; - -template -struct enable_packet_segment> : enable_packet_segment> {}; - -template -struct enable_packet_segment> - : bool_constant>::value && - enable_packet_segment>::value> {}; - -template -struct enable_packet_segment> - : bool_constant>::value && - enable_packet_segment>::value && - enable_packet_segment>::value> {}; - -template -struct enable_packet_segment> : enable_packet_segment> {}; - -template -struct enable_packet_segment> : enable_packet_segment> {}; - -template -struct enable_packet_segment> : enable_packet_segment> {}; - } // end namespace internal /** \class ScalarBinaryOpTraits