From 1dd1f8e454512a01bcab1ebe9bd77bf9de09ae22 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 9 Oct 2018 23:36:50 +0200 Subject: [PATCH] bug #65: add vectorization of partial reductions along the outer-dimension, for instance: colmajor_mat.rowwise().mean() --- Eigen/Core | 1 + Eigen/src/Core/CoreEvaluators.h | 58 ------- Eigen/src/Core/PartialReduxEvaluator.h | 224 +++++++++++++++++++++++++ Eigen/src/Core/VectorwiseOp.h | 78 +++++---- test/vectorwiseop.cpp | 1 + 5 files changed, 270 insertions(+), 92 deletions(-) create mode 100644 Eigen/src/Core/PartialReduxEvaluator.h diff --git a/Eigen/Core b/Eigen/Core index 6fd32dd82..a4596e73b 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -306,6 +306,7 @@ using std::ptrdiff_t; #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" +#include "src/Core/PartialReduxEvaluator.h" #include "src/Core/Random.h" #include "src/Core/Replicate.h" #include "src/Core/Reverse.h" diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 264446f65..d5da5cdec 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1325,64 +1325,6 @@ protected: const variable_if_dynamic m_cols; }; - -// -------------------- PartialReduxExpr -------------------- - -template< typename ArgType, typename MemberOp, int Direction> -struct evaluator > - : evaluator_base > -{ - typedef PartialReduxExpr XprType; - typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; - typedef typename ArgType::Scalar InputScalar; - typedef typename XprType::Scalar Scalar; - enum { - TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) - }; - typedef typename MemberOp::template Cost CostOpType; - enum { - CoeffReadCost = TraversalSize==Dynamic ? 
HugeCost - : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, - - Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized - }; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) - : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value)); - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index i, Index j) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(j)); - else - return m_functor(m_arg.row(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index index) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(index)); - else - return m_functor(m_arg.row(index)); - } - -protected: - typename internal::add_const_on_value_type::type m_arg; - const MemberOp m_functor; -}; - - // -------------------- MatrixWrapper and ArrayWrapper -------------------- // // evaluator_wrapper_base is a common base class for the diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h new file mode 100644 index 000000000..0bf8a50e0 --- /dev/null +++ b/Eigen/src/Core/PartialReduxEvaluator.h @@ -0,0 +1,224 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_PARTIALREDUX_H +#define EIGEN_PARTIALREDUX_H + +namespace Eigen { + +namespace internal { + + +/*************************************************************************** +* +* This file provides evaluators for partial reductions. +* There are two modes: +* +* - scalar path: simply calls the respective function on the column or row. +* -> nothing special here, all the tricky part is handled by the return +* types of VectorwiseOp's members. They embed the functor calling the +* respective DenseBase's member function. +* +* - vectorized path: implements a packet-wise reduction followed by +* some (optional) processing of the outcome, e.g., division by n for mean. +* +* For the vectorized path let's observe that the packet-size and outer-unrolling +* are both decided by the assignment logic. So all we have to do is to decide +* on the inner unrolling. +* +* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, +* but we need to be careful to specify the correct increment. +* +***************************************************************************/ + + +/* logic deciding a strategy for unrolling of vectorized paths */ +template +struct packetwise_redux_traits +{ + enum { + OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime, + Cost = OuterSize == Dynamic ? HugeCost + : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits::Cost, + Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? 
CompleteUnrolling : NoUnrolling + }; + +}; + +/* Value to be returned when size==0 , by default let's return 0 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } + +/* For products the default is 1 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } + +/* Perform the actual reduction */ +template::Unrolling +> +struct packetwise_redux_impl; + +/* Perform the actual reduction with unrolling */ +template +struct packetwise_redux_impl +{ + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + PacketType run(const Evaluator &eval, const Func& func, Index /*size*/) + { + return redux_vec_unroller::OuterSize>::template run(eval,func); + } +}; + +/* Add a specialization of redux_vec_unroller for size==0 at compiletime. + * This specialization is not required for general reductions, which is + * why it is defined here. 
+ */ +template +struct redux_vec_unroller +{ + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f) + { + return packetwise_redux_empty_value(f); + } +}; + +/* Perform the actual reduction for dynamic sizes */ +template +struct packetwise_redux_impl +{ + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + EIGEN_DEVICE_FUNC + static PacketType run(const Evaluator &eval, const Func& func, Index size) + { + if(size==0) + return packetwise_redux_empty_value(func); + + const Index size4 = (size-1)&(~3); + PacketType p = eval.template packetByOuterInner(0,0); + Index i = 1; + // This loop is optimized for instruction pipelining: + // - each iteration generates two independent instructions + // - thanks to branch prediction and out-of-order execution we have independent instructions across loops + for(; i(i+0,0),eval.template packetByOuterInner(i+1,0)), + func.packetOp(eval.template packetByOuterInner(i+2,0),eval.template packetByOuterInner(i+3,0)))); + for(; i(i,0)); + return p; + } +}; + +template< typename ArgType, typename MemberOp, int Direction> +struct evaluator > + : evaluator_base > +{ + typedef PartialReduxExpr XprType; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef typename ArgType::Scalar InputScalar; + typedef typename XprType::Scalar Scalar; + enum { + TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) + }; + typedef typename MemberOp::template Cost CostOpType; + enum { + CoeffReadCost = TraversalSize==Dynamic ? HugeCost + : TraversalSize==0 ? 1 + : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), + + _ArgFlags = evaluator::Flags, + + _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) + && bool(MemberOp::Vectorizable) + && (Direction==int(Vertical) ? 
bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) + && (TraversalSize!=0), + + Flags = (traits::Flags&RowMajorBit) + | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) + | (_Vectorizable ? PacketAccessBit : 0) + | LinearAccessBit, + + Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized + }; + + EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) + : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value))); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const + { + return coeff(Direction==Vertical ? j : i); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const + { + return m_functor(m_arg.template subVector(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketType packet(Index i, Index j) const + { + return packet(Direction==Vertical ? j : i); + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + PacketType packet(Index idx) const + { + enum { PacketSize = internal::unpacket_traits::size }; + typedef Block PanelType; + + PanelType panel(m_arg, + Direction==Vertical ? 0 : idx, + Direction==Vertical ? idx : 0, + Direction==Vertical ? m_arg.rows() : Index(PacketSize), + Direction==Vertical ? 
Index(PacketSize) : m_arg.cols()); + + typedef typename internal::redux_evaluator PanelEvaluator; + PanelEvaluator panel_eval(panel); + typedef typename MemberOp::BinaryOp BinaryOp; + PacketType p = internal::packetwise_redux_impl::template run(panel_eval,m_functor.binaryFunc(),m_arg.outerSize()); + return p; + } + +protected: + typename internal::add_const_on_value_type::type m_arg; + const MemberOp m_functor; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PARTIALREDUX_H diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 2a72c3cdd..a88b6e736 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -81,39 +81,46 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr \ - struct member_##MEMBER { \ - EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ - typedef ResultType result_type; \ - template struct Cost \ - { enum { value = COST }; }; \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ - ResultType operator()(const XprType& mat) const \ - { return mat.MEMBER(); } \ +template struct partial_redux_dummy_func; + +#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \ + template \ + struct member_##MEMBER { \ + EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ + typedef ResultType result_type; \ + typedef BINARYOP BinaryOp; \ + template struct Cost { enum { value = COST }; }; \ + enum { Vectorizable = VECTORIZABLE }; \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + ResultType operator()(const XprType& mat) const \ + { return mat.MEMBER(); } \ + BinaryOp binaryFunc() const { return BinaryOp(); } \ } +#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ + EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func) + namespace internal { -EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); 
EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits >::Cost ); -EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits::AddCost + NumTraits::MulCost); -EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); -template +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits::AddCost, 1, internal::scalar_sum_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_min_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_max_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits::MulCost, 1, internal::scalar_product_op); + +template struct member_lpnorm { typedef ResultType result_type; - template struct Cost + enum { Vectorizable = 0 }; + template struct Cost { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; EIGEN_DEVICE_FUNC member_lpnorm() {} template @@ -121,17 +128,20 @@ struct member_lpnorm { { return mat.template lpNorm

(); } }; -template +template struct member_redux { + typedef BinaryOpT BinaryOp; typedef typename result_of< BinaryOp(const Scalar&,const Scalar&) >::type result_type; - template struct Cost - { enum { value = (Size-1) * functor_traits::Cost }; }; + + enum { Vectorizable = functor_traits::PacketAccess }; + template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {} template EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase& mat) const { return mat.redux(m_functor); } + const BinaryOp& binaryFunc() const { return m_functor; } const BinaryOp m_functor; }; } @@ -175,11 +185,11 @@ template class VectorwiseOp typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; - template class Functor, - typename Scalar_=Scalar> struct ReturnType + template class Functor, + typename ReturnScalar=Scalar> struct ReturnType { typedef PartialReduxExpr, + Functor, Direction > Type; }; @@ -294,22 +304,22 @@ template class VectorwiseOp typedef typename ReturnType::Type MinCoeffReturnType; typedef typename ReturnType::Type MaxCoeffReturnType; - typedef typename ReturnType::Type SquaredNormReturnType; - typedef typename ReturnType::Type NormReturnType; + typedef PartialReduxExpr, const ExpressionTypeNestedCleaned>,internal::member_sum,Direction> SquaredNormReturnType; + typedef CwiseUnaryOp, const SquaredNormReturnType> NormReturnType; typedef typename ReturnType::Type BlueNormReturnType; typedef typename ReturnType::Type StableNormReturnType; typedef typename ReturnType::Type HypotNormReturnType; typedef typename ReturnType::Type SumReturnType; - typedef typename ReturnType::Type MeanReturnType; + typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType; typedef typename ReturnType::Type AllReturnType; typedef typename ReturnType::Type AnyReturnType; - 
typedef PartialReduxExpr, Direction> CountReturnType; + typedef PartialReduxExpr, Direction> CountReturnType; typedef typename ReturnType::Type ProdReturnType; typedef Reverse ConstReverseReturnType; typedef Reverse ReverseReturnType; template struct LpNormReturnType { - typedef PartialReduxExpr,Direction> Type; + typedef PartialReduxExpr,Direction> Type; }; /** \returns a row (or column) vector expression of the smallest coefficient @@ -348,7 +358,7 @@ template class VectorwiseOp * \sa DenseBase::squaredNorm() */ EIGEN_DEVICE_FUNC const SquaredNormReturnType squaredNorm() const - { return SquaredNormReturnType(_expression()); } + { return SquaredNormReturnType(m_matrix.cwiseAbs2()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -360,7 +370,7 @@ template class VectorwiseOp * \sa DenseBase::norm() */ EIGEN_DEVICE_FUNC const NormReturnType norm() const - { return NormReturnType(_expression()); } + { return NormReturnType(squaredNorm()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -425,7 +435,7 @@ template class VectorwiseOp * \sa DenseBase::mean() */ EIGEN_DEVICE_FUNC const MeanReturnType mean() const - { return MeanReturnType(_expression()); } + { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); } /** \returns a row (or column) vector expression representing * whether \b all coefficients of each respective column (or row) are \c true. 
@@ -630,7 +640,7 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC CwiseBinaryOp, const ExpressionTypeNestedCleaned, - const typename OppositeExtendedType::Type>::Type> + const typename OppositeExtendedType::Type> normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); } diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 96a9bb0ee..a6745cb85 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -256,6 +256,7 @@ EIGEN_DECLARE_TEST(vectorwiseop) CALL_SUBTEST_2( vectorwiseop_array(Array()) ); CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) ); CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) ); + CALL_SUBTEST_5( vectorwiseop_matrix(Matrix4f()) ); CALL_SUBTEST_5( vectorwiseop_matrix(Matrix()) ); CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random(1,EIGEN_TEST_MAX_SIZE))) );