bug #65: add vectorization of partial reductions along the outer-dimension, for instance: colmajor_mat.rowwise().mean()

This commit is contained in:
Gael Guennebaud 2018-10-09 23:36:50 +02:00
parent bfa2a81a50
commit 1dd1f8e454
5 changed files with 270 additions and 92 deletions

View File

@ -306,6 +306,7 @@ using std::ptrdiff_t;
#include "src/Core/BooleanRedux.h"
#include "src/Core/Select.h"
#include "src/Core/VectorwiseOp.h"
#include "src/Core/PartialReduxEvaluator.h"
#include "src/Core/Random.h"
#include "src/Core/Replicate.h"
#include "src/Core/Reverse.h"

View File

@ -1325,64 +1325,6 @@ protected:
const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
};
// -------------------- PartialReduxExpr --------------------
/** \internal
  * Scalar evaluator of a partial reduction expression.
  *
  * Every coefficient of the result is obtained by applying the member functor
  * to one sub-vector of the nested argument: a column when Direction==Vertical,
  * a row otherwise. No packet access is exposed by this evaluator.
  */
template< typename ArgType, typename MemberOp, int Direction>
struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
{
  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename ArgType::Scalar InputScalar;
  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;

  // Number of coefficients visited to produce a single output coefficient.
  enum {
    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
  };

  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;

  enum {
    // Reading one output coefficient reads a whole sub-vector and applies the functor.
    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),

    // Storage order comes from the expression itself, remaining hereditary bits from the argument.
    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,

    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }

  /** 2D access: only the index along the non-reduced direction is relevant. */
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar coeff(Index i, Index j) const
  {
    return coeff(Direction==Vertical ? j : i);
  }

  /** Linear access: reduces column \a index (Vertical) or row \a index (otherwise). */
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar coeff(Index index) const
  {
    return Direction==Vertical ? m_functor(m_arg.col(index))
                               : m_functor(m_arg.row(index));
  }

protected:
  typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg;
  const MemberOp m_functor;
};
// -------------------- MatrixWrapper and ArrayWrapper --------------------
//
// evaluator_wrapper_base<T> is a common base class for the

View File

@ -0,0 +1,224 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PARTIALREDUX_H
#define EIGEN_PARTIALREDUX_H
namespace Eigen {
namespace internal {
/***************************************************************************
*
* This file provides evaluators for partial reductions.
* There are two modes:
*
* - scalar path: simply calls the respective function on the column or row.
* -> nothing special here, all the tricky part is handled by the return
* types of VectorwiseOp's members. They embed the functor calling the
* respective DenseBase's member function.
*
* - vectorized path: implements a packet-wise reduction followed by
* some (optional) processing of the outcome, e.g., division by n for mean.
*
* For the vectorized path let's observe that the packet-size and outer-unrolling
* are both decided by the assignment logic. So all we have to do is to decide
* on the inner unrolling.
*
* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
* but we need to be careful to specify the correct increment.
*
***************************************************************************/
/* logic deciding a strategy for unrolling of vectorized paths */
template<typename Func, typename Evaluator>
struct packetwise_redux_traits
{
enum {
OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
Cost = OuterSize == Dynamic ? HugeCost
: OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
};
};
/* Packet returned when reducing over an empty range (size==0).
 * For a generic functor every lane is set to 0. */
template<typename PacketType,typename Func>
EIGEN_DEVICE_FUNC
PacketType packetwise_redux_empty_value(const Func& )
{
  const typename unpacket_traits<PacketType>::type zero(0);
  return pset1<PacketType>(zero);
}
/* Overload for products: reducing an empty range with scalar_product_op
 * yields a packet with every lane set to 1. */
template<typename PacketType,typename Scalar>
EIGEN_DEVICE_FUNC
PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& )
{
  const typename unpacket_traits<PacketType>::type one(1);
  return pset1<PacketType>(one);
}
/* Perform the actual reduction.
 *
 * Primary template, declared only: the implementation is selected through the
 * Unrolling parameter, which defaults to the strategy computed by
 * packetwise_redux_traits<Func, Evaluator>. Specializations are expected to
 * provide a static `run<PacketType>(eval, func, size)` returning the reduced packet.
 */
template<typename Func, typename Evaluator,
int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
>
struct packetwise_redux_impl;
/* Fully unrolled packet-wise reduction (outer size known at compile time).
 * The work is delegated to redux_vec_unroller, instantiated over the
 * OuterSize packets starting at index 0; the runtime size is ignored. */
template<typename Func, typename Evaluator>
struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
{
  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
  typedef typename Evaluator::Scalar Scalar;

  template<typename PacketType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
  {
    typedef redux_vec_unroller<Func, Evaluator, 0,
                               packetwise_redux_traits<Func, Evaluator>::OuterSize> Unroller;
    return Unroller::template run<PacketType>(eval, func);
  }
};
/* Specialization of redux_vec_unroller for a compile-time length of 0.
 * General reductions do not require it, which is why it lives here:
 * with nothing to combine, the functor-specific "empty" packet is returned
 * and the evaluator is never accessed.
 */
template<typename Func, typename Evaluator, int Start>
struct redux_vec_unroller<Func, Evaluator, Start, 0>
{
  template<typename PacketType>
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& func)
  {
    const PacketType emptyValue = packetwise_redux_empty_value<PacketType>(func);
    return emptyValue;
  }
};
/* Runtime-sized packet-wise reduction.
 * Combines the packets found at (outer=0..size-1, inner=0) with func.packetOp,
 * or returns the functor-specific "empty" packet when size==0. */
template<typename Func, typename Evaluator>
struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
{
  typedef typename Evaluator::Scalar Scalar;
  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

  template<typename PacketType>
  EIGEN_DEVICE_FUNC
  static PacketType run(const Evaluator &eval, const Func& func, Index size)
  {
    if(size==0)
      return packetwise_redux_empty_value<PacketType>(func);

    // (size-1) rounded down to a multiple of 4: since `outer` starts at 1 and advances
    // by 4, every access outer+3 performed by the unrolled loop stays <= size-1.
    const Index unrolledEnd = (size-1)&(~3);

    // The accumulator is seeded with the first packet.
    PacketType acc = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
    Index outer = 1;

    // Unrolled by 4 for instruction pipelining: the two pairwise combinations are
    // independent of each other, and branch prediction plus out-of-order execution
    // expose further independent instructions across iterations.
    while(outer<unrolledEnd)
    {
      const PacketType pair01 = func.packetOp(
          eval.template packetByOuterInner<Unaligned,PacketType>(outer+0,0),
          eval.template packetByOuterInner<Unaligned,PacketType>(outer+1,0));
      const PacketType pair23 = func.packetOp(
          eval.template packetByOuterInner<Unaligned,PacketType>(outer+2,0),
          eval.template packetByOuterInner<Unaligned,PacketType>(outer+3,0));
      acc = func.packetOp(acc, func.packetOp(pair01, pair23));
      outer += 4;
    }

    // Remaining packets, one at a time.
    while(outer<size)
    {
      acc = func.packetOp(acc, eval.template packetByOuterInner<Unaligned,PacketType>(outer,0));
      ++outer;
    }

    return acc;
  }
};
/** \internal
  * Evaluator of partial reduction expressions (e.g. mat.rowwise().sum()).
  *
  * - Scalar path (coeff): applies the member functor to the sub-vector selected by the index.
  * - Vectorized path (packet): available when the argument supports packet access, the
  *   functor is vectorizable and the reduction runs along the outer dimension of the
  *   argument. A packet of results is computed by combining, packet-wise, the slices of
  *   a panel of the argument.
  */
template< typename ArgType, typename MemberOp, int Direction>
struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
{
  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
  typedef typename ArgType::Scalar InputScalar;
  typedef typename XprType::Scalar Scalar;

  // Number of coefficients visited to produce a single output coefficient.
  enum {
    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
  };
  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;

  enum {
    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                  : TraversalSize==0 ? 1
                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),

    _ArgFlags = evaluator<ArgType>::Flags,

    // Packets can only be produced when reducing along the outer dimension of the argument:
    // column-wise reduction of a row-major argument, or row-wise reduction of a column-major one.
    _Vectorizable =  bool(int(_ArgFlags)&PacketAccessBit)
                  && bool(MemberOp::Vectorizable)
                  && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
                  && (TraversalSize!=0),

    Flags = (traits<XprType>::Flags&RowMajorBit)
          | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
          | (_Vectorizable ? PacketAccessBit : 0)
          | LinearAccessBit,

    Alignment = 0 // FIXME conservative value: the vectorized path only issues Unaligned packet loads; could be refined
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;

  /** 2D access: only the index along the non-reduced direction is relevant. */
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar coeff(Index i, Index j) const
  {
    return coeff(Direction==Vertical ? j : i);
  }

  /** Linear access: reduces the \a index-th sub-vector of the argument along Direction. */
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar coeff(Index index) const
  {
    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
  }

  /** 2D packet access: forwards to the linear version. */
  template<int LoadMode,typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  PacketType packet(Index i, Index j) const
  {
    return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
  }

  /** Computes PacketSize consecutive results at once, starting at result index \a idx.
    *
    * A panel of PacketSize columns (Vertical) or PacketSize rows (otherwise) is extracted
    * from the argument, and its outer slices are combined packet-wise with the binary
    * functor provided by MemberOp.
    */
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  PacketType packet(Index idx) const
  {
    enum { PacketSize = internal::unpacket_traits<PacketType>::size };

    // With a single-coefficient packet the panel below degenerates to a row or column
    // vector. Eigen forces the storage order of vectors, so the outer/inner roles get
    // swapped and packetByOuterInner(i,0) would not walk the reduced dimension anymore
    // (tracked upstream as bug 1612). Nothing is gained from packets of size 1 anyway,
    // so simply fall back to the scalar path.
    if(PacketSize==1)
      return internal::pset1<PacketType>(coeff(idx));

    // The panel must be a block of the type actually stored in m_arg: nested_eval may
    // have replaced ArgType by an evaluated temporary, in which case Block<const ArgType>
    // could not be constructed from m_arg.
    typedef Block<const ArgTypeNestedCleaned,
                  Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
                  Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
                  true /* InnerPanel */> PanelType;

    PanelType panel(m_arg,
                    Direction==Vertical ? 0 : idx,
                    Direction==Vertical ? idx : 0,
                    Direction==Vertical ? m_arg.rows() : Index(PacketSize),
                    Direction==Vertical ? Index(PacketSize) : m_arg.cols());

    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
    PanelEvaluator panel_eval(panel);
    typedef typename MemberOp::BinaryOp BinaryOp;
    PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
    return p;
  }

protected:
  typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg;
  const MemberOp m_functor;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PARTIALREDUX_H

View File

@ -81,39 +81,46 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
const MemberOp m_functor;
};
#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
template <typename ResultType> \
struct member_##MEMBER { \
EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \
typedef ResultType result_type; \
template<typename Scalar, int Size> struct Cost \
{ enum { value = COST }; }; \
template<typename XprType> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
ResultType operator()(const XprType& mat) const \
{ return mat.MEMBER(); } \
template<typename A,typename B> struct partial_redux_dummy_func;
/* Generates the partial-reduction functor `member_<MEMBER>`, a struct template
 * parameterized by the result type and the input scalar type. It exposes:
 *  - result_type:        the ResultType template argument,
 *  - BinaryOp:           BINARYOP<Scalar,Scalar>, the binary functor handed out by binaryFunc(),
 *  - Cost<Size>::value:  the COST expression (it may refer to `Size` and `Scalar`),
 *  - Vectorizable:       the VECTORIZABLE argument,
 *  - operator()(mat):    returns mat.MEMBER(),
 *  - binaryFunc():       returns a default-constructed BinaryOp.
 * The expansion ends at the closing brace of the struct: the trailing ';' must be
 * supplied at the point of use.
 */
#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \
template <typename ResultType,typename Scalar> \
struct member_##MEMBER { \
EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \
typedef ResultType result_type; \
typedef BINARYOP<Scalar,Scalar> BinaryOp; \
template<int Size> struct Cost { enum { value = COST }; }; \
enum { Vectorizable = VECTORIZABLE }; \
template<typename XprType> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
ResultType operator()(const XprType& mat) const \
{ return mat.MEMBER(); } \
BinaryOp binaryFunc() const { return BinaryOp(); } \
}
/* Shorthand for member functors without a vectorized path: Vectorizable is set to 0
 * and partial_redux_dummy_func is used as a placeholder for BinaryOp.
 * NOTE(review): only a declaration of partial_redux_dummy_func is visible here, so
 * binaryFunc() is presumably never meant to be called on these functors -- confirm. */
#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func)
namespace internal {
EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
template <int p, typename ResultType>
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
template <int p, typename ResultType,typename Scalar>
struct member_lpnorm {
typedef ResultType result_type;
template<typename Scalar, int Size> struct Cost
enum { Vectorizable = 0 };
template<int Size> struct Cost
{ enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
EIGEN_DEVICE_FUNC member_lpnorm() {}
template<typename XprType>
@ -121,17 +128,20 @@ struct member_lpnorm {
{ return mat.template lpNorm<p>(); }
};
template <typename BinaryOp, typename Scalar>
template <typename BinaryOpT, typename Scalar>
struct member_redux {
typedef BinaryOpT BinaryOp;
typedef typename result_of<
BinaryOp(const Scalar&,const Scalar&)
>::type result_type;
template<typename _Scalar, int Size> struct Cost
{ enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
template<typename Derived>
EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
{ return mat.redux(m_functor); }
const BinaryOp& binaryFunc() const { return m_functor; }
const BinaryOp m_functor;
};
}
@ -175,11 +185,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
template<template<typename _Scalar> class Functor,
typename Scalar_=Scalar> struct ReturnType
template<template<typename OutScalar,typename InputScalar> class Functor,
typename ReturnScalar=Scalar> struct ReturnType
{
typedef PartialReduxExpr<ExpressionType,
Functor<Scalar_>,
Functor<ReturnScalar,Scalar>,
Direction
> Type;
};
@ -294,22 +304,22 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,internal::member_sum<RealScalar,RealScalar>,Direction> SquaredNormReturnType;
typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
typedef typename ReturnType<internal::member_all>::Type AllReturnType;
typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
typedef Reverse<ExpressionType, Direction> ReverseReturnType;
template<int p> struct LpNormReturnType {
typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar,Scalar>,Direction> Type;
};
/** \returns a row (or column) vector expression of the smallest coefficient
@ -348,7 +358,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::squaredNorm() */
EIGEN_DEVICE_FUNC
const SquaredNormReturnType squaredNorm() const
{ return SquaredNormReturnType(_expression()); }
{ return SquaredNormReturnType(m_matrix.cwiseAbs2()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression.
@ -360,7 +370,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::norm() */
EIGEN_DEVICE_FUNC
const NormReturnType norm() const
{ return NormReturnType(_expression()); }
{ return NormReturnType(squaredNorm()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression.
@ -425,7 +435,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::mean() */
EIGEN_DEVICE_FUNC
const MeanReturnType mean() const
{ return MeanReturnType(_expression()); }
{ return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); }
/** \returns a row (or column) vector expression representing
* whether \b all coefficients of each respective column (or row) are \c true.
@ -630,7 +640,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
const ExpressionTypeNestedCleaned,
const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
const typename OppositeExtendedType<NormReturnType>::Type>
normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }

View File

@ -256,6 +256,7 @@ EIGEN_DECLARE_TEST(vectorwiseop)
CALL_SUBTEST_2( vectorwiseop_array(Array<double, 3, 2>()) );
CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) );
CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) );
CALL_SUBTEST_5( vectorwiseop_matrix(Matrix4f()) );
CALL_SUBTEST_5( vectorwiseop_matrix(Matrix<float,4,5>()) );
CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );