diff --git a/Eigen/src/Core/ForwardDeclarations.h b/Eigen/src/Core/ForwardDeclarations.h index 36519c7da..32be8cd68 100644 --- a/Eigen/src/Core/ForwardDeclarations.h +++ b/Eigen/src/Core/ForwardDeclarations.h @@ -83,27 +83,30 @@ template struct ei_eval template struct ei_unref { typedef T type; }; template struct ei_unref { typedef T type; }; -template struct ei_xpr_copy +template struct ei_is_temporary { - typedef typename ei_meta_if< ei_traits::Flags & EvalBeforeNestingBit, - typename ei_eval::type, const T&>::ret type; + enum { ret = 0 }; }; -template struct ei_xpr_copy > +template struct ei_is_temporary > { - typedef Temporary type; + enum { ret = 1 }; }; -template struct ei_eval_if_needed_before_nesting +template struct ei_xpr_copy { - // FIXME should we consider the additional store as well as the creation cost of the temporary ? - enum { eval = T::Flags & EvalBeforeNestingBit - || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost }; - typedef typename ei_meta_if::type, T>::ret XprType; - typedef typename ei_meta_if::type, typename T::XprCopy>::ret CopyType; + typedef typename ei_meta_if< + ei_is_temporary::ret, + T, + typename ei_meta_if< + ei_traits::Flags & EvalBeforeNestingBit + || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, + typename ei_eval::type, + const T& + >::ret + >::ret type; }; - template struct ei_functor_traits { enum diff --git a/Eigen/src/Core/OperatorEquals.h b/Eigen/src/Core/OperatorEquals.h index c93a9329f..5529c8313 100644 --- a/Eigen/src/Core/OperatorEquals.h +++ b/Eigen/src/Core/OperatorEquals.h @@ -102,14 +102,15 @@ template Derived& MatrixBase ::lazyAssign(const MatrixBase& other) { + const bool unroll = SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT; if(IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime) // copying a vector expression into a vector { ei_assert(size() == other.size()); - if(SizeAtCompileTime <= EIGEN_UNROLLING_LIMIT) + if(unroll) ei_vector_operator_equals_unroller ::run(derived(), other.derived()); else for(int i = 0; i < size(); i++) @@ -118,11 +119,11 @@ Derived& MatrixBase else // copying a matrix expression into a matrix { ei_assert(rows() == other.rows() && cols() == other.cols()); - if(SizeAtCompileTime <= EIGEN_UNROLLING_LIMIT) + if(unroll) { ei_matrix_operator_equals_unroller ::run(derived(), other.derived()); } else @@ -152,7 +153,7 @@ template Derived& MatrixBase ::operator=(const MatrixBase& other) { - if (OtherDerived::Flags & EvalBeforeAssigningBit) + if(OtherDerived::Flags & EvalBeforeAssigningBit) { return lazyAssign(other.derived().eval()); } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 608de0b9f..d303cbdb7 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -84,21 +84,29 @@ template struct ei_traits > { typedef typename Lhs::Scalar Scalar; + typedef typename ei_xpr_copy::type LhsXprCopy; + typedef typename ei_xpr_copy::type RhsXprCopy; + typedef typename ei_unref::type ActualLhs; + typedef typename ei_unref::type ActualRhs; enum { + LhsCoeffReadCost = ActualLhs::CoeffReadCost, + RhsCoeffReadCost = ActualRhs::CoeffReadCost, + LhsFlags = ActualLhs::Flags, + RhsFlags = ActualRhs::Flags, RowsAtCompileTime = Lhs::RowsAtCompileTime, ColsAtCompileTime = Rhs::ColsAtCompileTime, MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime, MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime, Flags = ( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) - ? (unsigned int)(Lhs::Flags | Rhs::Flags) - : (unsigned int)(Lhs::Flags | Rhs::Flags) & ~LargeBit ) + ? (unsigned int)(LhsFlags | RhsFlags) + : (unsigned int)(LhsFlags | RhsFlags) & ~LargeBit ) | EvalBeforeAssigningBit | (ei_product_eval_mode::value == (int)CacheOptimal ? EvalBeforeNestingBit : 0), CoeffReadCost = Lhs::ColsAtCompileTime == Dynamic ? Dynamic : Lhs::ColsAtCompileTime - * (NumTraits::MulCost + Lhs::CoeffReadCost + Rhs::CoeffReadCost) + * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + (Lhs::ColsAtCompileTime - 1) * NumTraits::AddCost }; }; @@ -110,10 +118,8 @@ template class Product : ei_no_assignm EIGEN_GENERIC_PUBLIC_INTERFACE(Product) - typedef typename ei_eval_if_needed_before_nesting::CopyType CopyLhs; - typedef typename ei_eval_if_needed_before_nesting::CopyType CopyRhs; - typedef typename ei_eval_if_needed_before_nesting::XprType XprLhs; - typedef typename ei_eval_if_needed_before_nesting::XprType XprRhs; + typedef typename ei_traits::LhsXprCopy LhsXprCopy; + typedef typename ei_traits::RhsXprCopy RhsXprCopy; Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) @@ -133,12 +139,15 @@ template class Product : ei_no_assignm const Scalar _coeff(int row, int col) const { Scalar res; - if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT) + const bool unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT; + if(unroll) + { ei_product_unroller + unroll ? Lhs::ColsAtCompileTime : Dynamic, + typename ei_unref::type, + typename ei_unref::type> ::run(row, col, m_lhs, m_rhs, res); + } else { res = m_lhs.coeff(row, 0) * m_rhs.coeff(0, col); @@ -149,8 +158,8 @@ template class Product : ei_no_assignm } protected: - const CopyLhs m_lhs; - const CopyRhs m_rhs; + const LhsXprCopy m_lhs; + const RhsXprCopy m_rhs; }; /** \returns the matrix product of \c *this and \a other. diff --git a/Eigen/src/Core/Util.h b/Eigen/src/Core/Util.h index 10fdacb8b..ad8a15b07 100644 --- a/Eigen/src/Core/Util.h +++ b/Eigen/src/Core/Util.h @@ -31,7 +31,7 @@ /** Defines the maximal loop size to enable meta unrolling of loops */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 16 +#define EIGEN_UNROLLING_LIMIT 400 #endif #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR diff --git a/bench/benchmark.cpp b/bench/benchmark.cpp index ee58607cd..4ff678d8a 100644 --- a/bench/benchmark.cpp +++ b/bench/benchmark.cpp @@ -8,6 +8,10 @@ using namespace std; USING_PART_OF_NAMESPACE_EIGEN +#ifndef REPEAT +#define REPEAT 40000000 +#endif + int main(int argc, char *argv[]) { Matrix I; @@ -19,7 +23,7 @@ int main(int argc, char *argv[]) m(i,j) = (i+MATSIZE*j); } asm("#begin"); - for(int a = 0; a < 40000000; a++) + for(int a = 0; a < REPEAT; a++) { m = I + 0.00005 * (m + m*m); }