From 713c92140c0033265b91ea0089bf6af5a89dff4c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 23 Jul 2009 14:20:45 +0200 Subject: [PATCH] improve SYMV it is now faster and ready for use --- Eigen/src/Core/SelfAdjointView.h | 67 ++++++++++---- .../Core/products/SelfadjointMatrixVector.h | 87 ++++++++++++------- test/product_selfadjoint.cpp | 35 +++++--- 3 files changed, 126 insertions(+), 63 deletions(-) diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index c73a3ffce..540f4fe93 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -120,7 +120,7 @@ template class SelfAdjointView /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this: * \f$ this = this + \alpha ( u v^* + v u^*) \f$ - * + * * The vectors \a u and \c v \b must be column vectors, however they can be * a adjoint expression without any overhead. Only the meaningful triangular * part of the matrix is updated, the rest is left unchanged. @@ -184,34 +184,67 @@ struct ei_triangular_assignment_selector -struct ei_selfadjoint_product_returntype - : public ReturnByValue, +template +struct ei_selfadjoint_product_returntype + : public ReturnByValue, Matrix::Scalar, Rhs::RowsAtCompileTime,Rhs::ColsAtCompileTime> > { - typedef typename ei_cleantype::type RhsNested; + typedef typename Lhs::Scalar Scalar; + + typedef typename Lhs::Nested LhsNested; + typedef typename ei_cleantype::type _LhsNested; + typedef ei_blas_traits<_LhsNested> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename ei_cleantype::type _ActualLhsType; + + typedef typename Rhs::Nested RhsNested; + typedef typename ei_cleantype::type _RhsNested; + typedef ei_blas_traits<_RhsNested> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename ei_cleantype::type _ActualRhsType; + + enum { + LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit) + }; + ei_selfadjoint_product_returntype(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {} + template inline void _addTo(Dest& dst) const + { evalTo(dst,1); } + template inline void _subTo(Dest& dst) const + { evalTo(dst,-1); } + template void evalTo(Dest& dst) const { - dst.resize(m_rhs.rows(), m_rhs.cols()); - ei_product_selfadjoint_vector::Flags&RowMajorBit, - LhsMode&(UpperTriangularBit|LowerTriangularBit)> + dst.resize(m_lhs.rows(), m_rhs.cols()); + dst.setZero(); + evalTo(dst,1); + } + + template void evalTo(Dest& dst, Scalar alpha) const + { + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet"); + ei_product_selfadjoint_vector::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> ( - m_lhs.rows(), // size - m_lhs.data(), // lhs - m_lhs.stride(), // lhsStride, - m_rhs.data(), // rhs - // int rhsIncr, - dst.data() // res + lhs.rows(), // size + &lhs.coeff(0,0), lhs.stride(), // lhs info + &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info + &dst.coeffRef(0), // result info + actualAlpha // scale factor ); } - const typename Lhs::Nested m_lhs; - const typename Rhs::Nested m_rhs; + const LhsNested m_lhs; + const RhsNested m_rhs; }; /*************************************************************************** @@ -235,7 +268,7 @@ struct ei_selfadjoint_product_returntype typedef ei_blas_traits<_LhsNested> LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef typename ei_cleantype::type _ActualLhsType; - + typedef typename Rhs::Nested RhsNested; typedef typename ei_cleantype::type _RhsNested; typedef ei_blas_traits<_RhsNested> RhsBlasTraits; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 8ac83772c..8e0dc9526 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -30,12 +30,12 @@ * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. */ -template +template static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( int size, - const Scalar* lhs, int lhsStride, - const Scalar* rhs, //int rhsIncr, - Scalar* res) + const Scalar* lhs, int lhsStride, + const Scalar* _rhs, int rhsIncr, + Scalar* res, Scalar alpha) { typedef typename ei_packet_traits::type Packet; const int PacketSize = sizeof(Packet)/sizeof(Scalar); @@ -46,8 +46,22 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( FirstTriangular = IsRowMajor == IsLower }; - ei_conj_if::IsComplex && IsRowMajor> conj0; - ei_conj_if::IsComplex && !IsRowMajor> conj1; + ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + + Scalar cjAlpha = ConjugateRhs ? ei_conj(alpha) : alpha; + + // if the rhs is not sequentially stored in memory we copy it to a temporary buffer, + // this is because we need to extract packets + const Scalar* EIGEN_RESTRICT rhs = _rhs; + if (rhsIncr!=1) + { + Scalar* r = ei_aligned_stack_new(Scalar, size); + const Scalar* it = _rhs; + for (int i=0; i huge speed up) + // gcc 4.2 does this optimization automatically. + const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; for (size_t i=alignedStart; i(rhs), size); } diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp index 44bafad93..2cacc8e5e 100644 --- a/test/product_selfadjoint.cpp +++ b/test/product_selfadjoint.cpp @@ -31,6 +31,8 @@ template void product_selfadjoint(const MatrixType& m) typedef Matrix VectorType; typedef Matrix RowVectorType; + typedef Matrix RhsMatrixType; + int rows = m.rows(); int cols = m.cols(); @@ -38,31 +40,36 @@ template void product_selfadjoint(const MatrixType& m) m2 = MatrixType::Random(rows, cols), m3; VectorType v1 = VectorType::Random(rows), - v2 = VectorType::Random(rows); - + v2 = VectorType::Random(rows), + v3(rows); RowVectorType r1 = RowVectorType::Random(rows), r2 = RowVectorType::Random(rows); + RhsMatrixType m4 = RhsMatrixType::Random(rows,10); Scalar s1 = ei_random(), s2 = ei_random(), s3 = ei_random(); - m1 = m1.adjoint()*m1; + m1 = (m1.adjoint() + m1).eval(); // lower - m2.setZero(); - m2.template triangularView() = m1; - ei_product_selfadjoint_vector - (cols,m2.data(),cols, v1.data(), v2.data()); - VERIFY_IS_APPROX(v2, m1 * v1); - VERIFY_IS_APPROX((m2.template selfadjointView() * v1).eval(), m1 * v1); + m2 = m1.template triangularView(); + VERIFY_IS_APPROX(v3 = (s1*m2).template selfadjointView() * (s2*v1), (s1*m1) * (s2*v1)); + VERIFY_IS_APPROX(v3 = (s1*m2.conjugate()).template selfadjointView() * (s2*v1), (s1*m1.conjugate()) * (s2*v1)); + VERIFY_IS_APPROX(v3 = (s1*m2).template selfadjointView() * (s2*m4.col(1)), (s1*m1) * (s2*m4.col(1))); + + VERIFY_IS_APPROX(v3 = (s1*m2).template selfadjointView() * (s2*v1.conjugate()), (s1*m1) * (s2*v1.conjugate())); + VERIFY_IS_APPROX(v3 = (s1*m2.conjugate()).template selfadjointView() * (s2*v1.conjugate()), (s1*m1.conjugate()) * (s2*v1.conjugate())); // upper - m2.setZero(); - m2.template triangularView() = m1; - ei_product_selfadjoint_vector(cols,m2.data(),cols, v1.data(), v2.data()); - VERIFY_IS_APPROX(v2, m1 * v1); - VERIFY_IS_APPROX((m2.template selfadjointView() * v1).eval(), m1 * v1); + m2 = m1.template triangularView(); + VERIFY_IS_APPROX(v3 = (s1*m2).template selfadjointView() * (s2*v1), (s1*m1) * (s2*v1)); + VERIFY_IS_APPROX(v3 = (s1*m2.conjugate()).template selfadjointView() * (s2*v1), (s1*m1.conjugate()) * (s2*v1)); + VERIFY_IS_APPROX(v3 = (s1*m2.adjoint()).template selfadjointView() * (s2*v1), (s1*m1.adjoint()) * (s2*v1)); + VERIFY_IS_APPROX(v3 = (s1*m2.transpose()).template selfadjointView() * (s2*v1), (s1*m1.transpose()) * (s2*v1)); + + VERIFY_IS_APPROX(v3 = (s1*m2).template selfadjointView() * (s2*v1.conjugate()), (s1*m1) * (s2*v1.conjugate())); + VERIFY_IS_APPROX(v3 = (s1*m2.conjugate()).template selfadjointView() * (s2*v1.conjugate()), (s1*m1.conjugate()) * (s2*v1.conjugate())); // rank2 update m2 = m1.template triangularView();