diff --git a/Eigen/Core b/Eigen/Core index cd942b3c6..28ed09035 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -138,7 +138,7 @@ namespace Eigen { #endif #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif #include "src/Core/Functors.h" @@ -180,18 +180,20 @@ namespace Eigen { #include "src/Core/util/BlasUtil.h" #include "src/Core/ProductBase.h" #include "src/Core/Product.h" -#include "src/Core/products/GeneralMatrixMatrix.h" -#include "src/Core/products/GeneralMatrixVector.h" -#include "src/Core/products/SelfadjointMatrixVector.h" -#include "src/Core/products/SelfadjointMatrixMatrix.h" #include "src/Core/TriangularMatrix.h" #include "src/Core/SelfAdjointView.h" #include "src/Core/SolveTriangular.h" +#include "src/Core/products/GeneralUnrolled.h" +#include "src/Core/products/GeneralBlockPanelKernel.h" +#include "src/Core/products/GeneralMatrixVector.h" +#include "src/Core/products/GeneralMatrixMatrix.h" +#include "src/Core/products/SelfadjointMatrixVector.h" +#include "src/Core/products/SelfadjointMatrixMatrix.h" #include "src/Core/products/SelfadjointProduct.h" #include "src/Core/products/SelfadjointRank2Update.h" #include "src/Core/products/TriangularMatrixVector.h" -#include "src/Core/products/TriangularSolverMatrix.h" #include "src/Core/products/TriangularMatrixMatrix.h" +#include "src/Core/products/TriangularSolverMatrix.h" #include "src/Core/BandMatrix.h" } // namespace Eigen diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index a4ece7f0d..fe559a991 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -63,23 +63,22 @@ template struct ei_product_type Cols = Rhs::ColsAtCompileTime, Depth = EIGEN_ENUM_MIN(Lhs::ColsAtCompileTime,Rhs::RowsAtCompileTime), - value = ei_product_type_selector<(Rows>8 ? Large : (Rows==1 ? 1 : Small)), - (Cols>8 ? Large : (Cols==1 ? 1 : Small)), - (Depth>8 ? Large : (Depth==1 ? 1 : Small))>::ret + value = ei_product_type_selector<(Rows >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Rows==1 ? 1 : Small)), + (Cols >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Cols==1 ? 1 : Small)), + (Depth>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Depth==1 ? 1 : Small))>::ret }; }; +/* The following allows to select the kind of product at compile time + * based on the three dimensions of the product. + * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ +// FIXME I'm not sure the current mapping is the ideal one. template struct ei_product_type_selector { enum { ret = OuterProduct }; }; template struct ei_product_type_selector<1,1,Depth> { enum { ret = InnerProduct }; }; template<> struct ei_product_type_selector<1,1,1> { enum { ret = InnerProduct }; }; template<> struct ei_product_type_selector { enum { ret = UnrolledProduct }; }; template<> struct ei_product_type_selector<1,Small,Small> { enum { ret = UnrolledProduct }; }; template<> struct ei_product_type_selector { enum { ret = UnrolledProduct }; }; - -// template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; -// template<> struct ei_product_type_selector<1,Small,Small> { enum { ret = GemvProduct }; }; -// template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; - template<> struct ei_product_type_selector<1,Large,Small> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1,Large,Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1,Small,Large> { enum { ret = GemvProduct }; }; @@ -129,48 +128,6 @@ struct ProductReturnType }; -/*********************************************************************** -* Implementation of General Matrix Matrix Product -***********************************************************************/ - -template -struct ei_traits > - : ei_traits, Lhs, Rhs> > -{}; - -template -class GeneralProduct - : public ProductBase, Lhs, Rhs> -{ - public: - EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) - - GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template void addTo(Dest& dst, Scalar alpha) const - { - ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - - const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); - const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); - - ei_general_matrix_matrix_product< - Scalar, - (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate), - (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit)?RowMajor:ColMajor> - ::run( - this->rows(), this->cols(), lhs.cols(), - (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), - (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), - actualAlpha); - } -}; - /*********************************************************************** * Implementation of Inner Vector Vector Product ***********************************************************************/ @@ -403,362 +360,6 @@ template<> struct ei_gemv_selector } }; -/*********************************************************************** -* Implementation of products with small fixed sizes -***********************************************************************/ - -/* Since the all the dimensions of the product are small, here we can rely - * on the generic Assign mechanism to evaluate the product per coeff (or packet). - * - * Note that the here inner-loops should always be unrolled. - */ - -template -struct ei_product_coeff_impl; - -template -struct ei_product_packet_impl; - -template -struct ei_traits > -{ - typedef typename ei_cleantype::type _LhsNested; - typedef typename ei_cleantype::type _RhsNested; - typedef typename ei_scalar_product_traits::ReturnType Scalar; - - enum { - LhsCoeffReadCost = _LhsNested::CoeffReadCost, - RhsCoeffReadCost = _RhsNested::CoeffReadCost, - LhsFlags = _LhsNested::Flags, - RhsFlags = _RhsNested::Flags, - - RowsAtCompileTime = _LhsNested::RowsAtCompileTime, - ColsAtCompileTime = _RhsNested::ColsAtCompileTime, - InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), - - MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, - MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, - - LhsRowMajor = LhsFlags & RowMajorBit, - RhsRowMajor = RhsFlags & RowMajorBit, - - CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits::size) == 0), - - CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits::size) == 0), - - EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs), - - RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit), - - Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) - | EvalBeforeAssigningBit - | EvalBeforeNestingBit - | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0) - | (LhsFlags & RhsFlags & AlignedBit), - - CoeffReadCost = InnerSize == Dynamic ? Dynamic - : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) - + (InnerSize - 1) * NumTraits::AddCost, - - /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside - * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner - * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect - * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. - */ - CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (InnerSize % ei_packet_traits::size == 0) - }; -}; - -template class GeneralProduct - : ei_no_assignment_operator, - public MatrixBase > -{ - public: - - EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct) - - private: - - typedef typename ei_traits::_LhsNested _LhsNested; - typedef typename ei_traits::_RhsNested _RhsNested; - - enum { - PacketSize = ei_packet_traits::size, - InnerSize = ei_traits::InnerSize, - Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - CanVectorizeInner = ei_traits::CanVectorizeInner - }; - - typedef ei_product_coeff_impl ScalarCoeffImpl; - - public: - - template - inline GeneralProduct(const Lhs& lhs, const Rhs& rhs) - : m_lhs(lhs), m_rhs(rhs) - { - // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. - // We still allow to mix T and complex. - EIGEN_STATIC_ASSERT((ei_is_same_type::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - ei_assert(lhs.cols() == rhs.rows() - && "invalid matrix product" - && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); - } - - EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); } - EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); } - - EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const - { - Scalar res; - ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); - return res; - } - - /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, - * which is why we don't set the LinearAccessBit. - */ - EIGEN_STRONG_INLINE const Scalar coeff(int index) const - { - Scalar res; - const int row = RowsAtCompileTime == 1 ? 0 : index; - const int col = RowsAtCompileTime == 1 ? index : 0; - ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); - return res; - } - - template - EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const - { - PacketScalar res; - ei_product_packet_impl - ::run(row, col, m_lhs, m_rhs, res); - return res; - } - - protected: - const LhsNested m_lhs; - const RhsNested m_rhs; -}; - - -/*************************************************************************** -* Normal product .coeff() implementation (with meta-unrolling) -***************************************************************************/ - -/************************************** -*** Scalar path - no vectorization *** -**************************************/ - -template -struct ei_product_coeff_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - ei_product_coeff_impl::run(row, col, lhs, rhs, res); - res += lhs.coeff(row, Index) * rhs.coeff(Index, col); - } -}; - -template -struct ei_product_coeff_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - res = lhs.coeff(row, 0) * rhs.coeff(0, col); - } -}; - -template -struct ei_product_coeff_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) - { - ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = lhs.coeff(row, 0) * rhs.coeff(0, col); - for(int i = 1; i < lhs.cols(); ++i) - res += lhs.coeff(row, i) * rhs.coeff(i, col); - } -}; - -// prevent buggy user code from causing an infinite recursion -template -struct ei_product_coeff_impl -{ - EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {} -}; - -/******************************************* -*** Scalar path with inner vectorization *** -*******************************************/ - -template -struct ei_product_coeff_vectorized_unroller -{ - enum { PacketSize = ei_packet_traits::size }; - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) - { - ei_product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); - pres = ei_padd(pres, ei_pmul( lhs.template packet(row, Index) , rhs.template packet(Index, col) )); - } -}; - -template -struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) - { - pres = ei_pmul(lhs.template packet(row, 0) , rhs.template packet(0, col)); - } -}; - -template -struct ei_product_coeff_impl -{ - typedef typename Lhs::PacketScalar PacketScalar; - enum { PacketSize = ei_packet_traits::size }; - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - PacketScalar pres; - ei_product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); - ei_product_coeff_impl::run(row, col, lhs, rhs, res); - res = ei_predux(pres); - } -}; - -template -struct ei_product_coeff_vectorized_dyn_selector -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Block::ColsAtCompileTime>, - Block::RowsAtCompileTime, 1>, - LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col)); - } -}; - -// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower -// NOTE maybe they are now useless since we have a specialization for Block -template -struct ei_product_coeff_vectorized_dyn_selector -{ - EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Lhs, - Block::RowsAtCompileTime, 1>, - LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col)); - } -}; - -template -struct ei_product_coeff_vectorized_dyn_selector -{ - EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Block::ColsAtCompileTime>, - Rhs, - LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs); - } -}; - -template -struct ei_product_coeff_vectorized_dyn_selector -{ - EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Lhs, - Rhs, - LinearVectorization, NoUnrolling>::run(lhs, rhs); - } -}; - -template -struct ei_product_coeff_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - ei_product_coeff_vectorized_dyn_selector::run(row, col, lhs, rhs, res); - } -}; - -/******************* -*** Packet path *** -*******************/ - -template -struct ei_product_packet_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - ei_product_packet_impl::run(row, col, lhs, rhs, res); - res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet(Index, col), res); - } -}; - -template -struct ei_product_packet_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - ei_product_packet_impl::run(row, col, lhs, rhs, res); - res = ei_pmadd(lhs.template packet(row, Index), ei_pset1(rhs.coeff(Index, col)), res); - } -}; - -template -struct ei_product_packet_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col)); - } -}; - -template -struct ei_product_packet_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col))); - } -}; - -template -struct ei_product_packet_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) - { - ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col)); - for(int i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res); - } -}; - -template -struct ei_product_packet_impl -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) - { - ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col))); - for(int i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(lhs.template packet(row, i), ei_pset1(rhs.coeff(i, col)), res); - } -}; - /*************************************************************************** * Implementation of matrix base methods ***************************************************************************/ diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 883edd165..95ce666c9 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -196,101 +196,6 @@ struct ei_triangular_assignment_selector -struct ei_traits > - : ei_traits, Lhs, Rhs> > -{}; - -template -struct SelfadjointProductMatrix - : public ProductBase, Lhs, Rhs > -{ - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) - - enum { - LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit) - }; - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template void addTo(Dest& dst, Scalar alpha) const - { - ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - - const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); - const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); - - ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet"); - ei_product_selfadjoint_vector::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> - ( - lhs.rows(), // size - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info - &dst.coeffRef(0), // result info - actualAlpha // scale factor - ); - } -}; - -/*************************************************************************** -* Wrapper to ei_product_selfadjoint_matrix -***************************************************************************/ - -template -struct ei_traits > - : ei_traits, Lhs, Rhs> > -{}; - -template -struct SelfadjointProductMatrix - : public ProductBase, Lhs, Rhs > -{ - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - enum { - LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit), - LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit, - RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit), - RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit - }; - - template void addTo(Dest& dst, Scalar alpha) const - { - ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - - const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); - const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); - - ei_product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, - NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)), - EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular, - ei_traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, - NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)), - ei_traits::Flags&RowMajorBit ? RowMajor : ColMajor> - ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0,0), rhs.stride(), // rhs info - &dst.coeffRef(0,0), dst.stride(), // result info - actualAlpha // alpha - ); - } -}; - /*************************************************************************** * Implementation of MatrixBase methods ***************************************************************************/ diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..129fd86e7 --- /dev/null +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -0,0 +1,443 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_BLOCK_PANEL_H +#define EIGEN_GENERAL_BLOCK_PANEL_H + +#ifndef EIGEN_EXTERN_INSTANTIATIONS + +// optimized GEneral packed Block * packed Panel product kernel +template +struct ei_gebp_kernel +{ + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0) + { + typedef typename ei_packet_traits::type PacketType; + enum { PacketSize = ei_packet_traits::size }; + if(strideA==-1) strideA = depth; + if(strideB==-1) strideB = depth; + Conj cj; + int packet_cols = (cols/nr) * nr; + const int peeled_mc = (rows/mr)*mr; + // loops on each cache friendly block of the result/rhs + for(int j2=0; j2 do the same but with nr==1 + for(int j2=packet_cols; j2 +struct ei_gemm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows, + int stride=0, int offset=0) + { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + ei_conj_if::IsComplex && Conjugate> cj; + ei_const_blas_data_mapper lhs(_lhs,lhsStride); + int count = 0; + const int peeled_mc = (rows/mr)*mr; + for(int i=0; i +struct ei_gemm_pack_rhs +{ + enum { PacketSize = ei_packet_traits::size }; + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, + int stride=0, int offset=0) + { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; + int count = 0; + for(int j2=0; j2 +struct ei_gemm_pack_rhs +{ + enum { PacketSize = ei_packet_traits::size }; + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, + int stride=0, int offset=0) + { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; + int count = 0; + for(int j2=0; j2 simple transposition of the product */ template< typename Scalar, int LhsStorageOrder, bool ConjugateLhs, @@ -51,6 +52,8 @@ struct ei_general_matrix_matrix_product Blocking algorithm following Goto's paper */ template< typename Scalar, int LhsStorageOrder, bool ConjugateLhs, @@ -78,23 +81,30 @@ static void run(int rows, int cols, int depth, Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); - // => GEMM_VAR1 + // For each horizontal panel of the rhs, and corresponding panel of the lhs... + // (==GEMM_VAR1) for(int k2=0; k2 Pack rhs's panel into a sequential chunk of memory (L2 caching) + // Note that this panel will be read as many times as the number of blocks in the lhs's + // vertical panel which is, in practice, a very low number. ei_gemm_pack_rhs()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); - // => GEPP_VAR1 + // For each mc x kc block of the lhs's vertical panel... + // (==GEPP_VAR1) for(int i2=0; i2()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + // Everything is packed, we can now call the block * panel kernel: ei_gebp_kernel >() (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } @@ -106,417 +116,49 @@ static void run(int rows, int cols, int depth, }; -// optimized GEneral packed Block * packed Panel product kernel -template -struct ei_gebp_kernel -{ - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0) - { - typedef typename ei_packet_traits::type PacketType; - enum { PacketSize = ei_packet_traits::size }; - if(strideA==-1) strideA = depth; - if(strideB==-1) strideB = depth; - Conj cj; - int packet_cols = (cols/nr) * nr; - const int peeled_mc = (rows/mr)*mr; - // loops on each cache friendly block of the result/rhs - for(int j2=0; j2 do the same but with nr==1 - for(int j2=packet_cols; j2 -struct ei_gemm_pack_lhs -{ - void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows, - int stride=0, int offset=0) - { - ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - ei_conj_if::IsComplex && Conjugate> cj; - ei_const_blas_data_mapper lhs(_lhs,lhsStride); - int count = 0; - const int peeled_mc = (rows/mr)*mr; - for(int i=0; i -struct ei_gemm_pack_rhs -{ - enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, - int stride=0, int offset=0) - { - ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - bool hasAlpha = alpha != Scalar(1); - int packet_cols = (cols/nr) * nr; - int count = 0; - for(int j2=0; j2 -struct ei_gemm_pack_rhs -{ - enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, - int stride=0, int offset=0) - { - ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - bool hasAlpha = alpha != Scalar(1); - int packet_cols = (cols/nr) * nr; - int count = 0; - for(int j2=0; j2 for "large" GEMM, i.e., +* implementation of the high level wrapper to ei_general_matrix_matrix_product +**********************************************************************************/ + +template +struct ei_traits > + : ei_traits, Lhs, Rhs> > +{}; + +template +class GeneralProduct + : public ProductBase, Lhs, Rhs> +{ + public: + EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void addTo(Dest& dst, Scalar alpha) const + { + ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_general_matrix_matrix_product< + Scalar, + (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate), + (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit)?RowMajor:ColMajor> + ::run( + this->rows(), this->cols(), lhs.cols(), + (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), + (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), + (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), + actualAlpha); + } +}; + #endif // EIGEN_GENERAL_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/GeneralUnrolled.h b/Eigen/src/Core/products/GeneralUnrolled.h new file mode 100644 index 000000000..7241976a8 --- /dev/null +++ b/Eigen/src/Core/products/GeneralUnrolled.h @@ -0,0 +1,385 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2008 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_UNROLLED_PRODUCT_H +#define EIGEN_GENERAL_UNROLLED_PRODUCT_H + +/********************************************************************************* +* Specialization of GeneralProduct<> for products with small fixed sizes +*********************************************************************************/ + +/* Since the all the dimensions of the product are small, here we can rely + * on the generic Assign mechanism to evaluate the product per coeff (or packet). + * + * Note that here the inner-loops should always be unrolled. + */ + +template +struct ei_product_coeff_impl; + +template +struct ei_product_packet_impl; + +template +struct ei_traits > +{ + typedef typename ei_cleantype::type _LhsNested; + typedef typename ei_cleantype::type _RhsNested; + typedef typename ei_scalar_product_traits::ReturnType Scalar; + + enum { + LhsCoeffReadCost = _LhsNested::CoeffReadCost, + RhsCoeffReadCost = _RhsNested::CoeffReadCost, + LhsFlags = _LhsNested::Flags, + RhsFlags = _RhsNested::Flags, + + RowsAtCompileTime = _LhsNested::RowsAtCompileTime, + ColsAtCompileTime = _RhsNested::ColsAtCompileTime, + InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), + + MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, + MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, + + LhsRowMajor = LhsFlags & RowMajorBit, + RhsRowMajor = RhsFlags & RowMajorBit, + + CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) + && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits::size) == 0), + + CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) + && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits::size) == 0), + + EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs), + + RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit), + + Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) + | EvalBeforeAssigningBit + | EvalBeforeNestingBit + | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0) + | (LhsFlags & RhsFlags & AlignedBit), + + CoeffReadCost = InnerSize == Dynamic ? Dynamic + : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + + (InnerSize - 1) * NumTraits::AddCost, + + /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. + */ + CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) + && (InnerSize % ei_packet_traits::size == 0) + }; +}; + +template class GeneralProduct + : ei_no_assignment_operator, + public MatrixBase > +{ + public: + + EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct) + + private: + + typedef typename ei_traits::_LhsNested _LhsNested; + typedef typename ei_traits::_RhsNested _RhsNested; + + enum { + PacketSize = ei_packet_traits::size, + InnerSize = ei_traits::InnerSize, + Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, + CanVectorizeInner = ei_traits::CanVectorizeInner + }; + + typedef ei_product_coeff_impl ScalarCoeffImpl; + + public: + + template + inline GeneralProduct(const Lhs& lhs, const Rhs& rhs) + : m_lhs(lhs), m_rhs(rhs) + { + // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. + // We still allow to mix T and complex. + EIGEN_STATIC_ASSERT((ei_is_same_type::ret), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + ei_assert(lhs.cols() == rhs.rows() + && "invalid matrix product" + && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); + } + + EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); } + EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); } + + EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const + { + Scalar res; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, + * which is why we don't set the LinearAccessBit. + */ + EIGEN_STRONG_INLINE const Scalar coeff(int index) const + { + Scalar res; + const int row = RowsAtCompileTime == 1 ? 0 : index; + const int col = RowsAtCompileTime == 1 ? index : 0; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + template + EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const + { + PacketScalar res; + ei_product_packet_impl + ::run(row, col, m_lhs, m_rhs, res); + return res; + } + + protected: + const LhsNested m_lhs; + const RhsNested m_rhs; +}; + + +/*************************************************************************** +* Normal product .coeff() implementation (with meta-unrolling) +***************************************************************************/ + +/************************************** +*** Scalar path - no vectorization *** +**************************************/ + +template +struct ei_product_coeff_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + ei_product_coeff_impl::run(row, col, lhs, rhs, res); + res += lhs.coeff(row, Index) * rhs.coeff(Index, col); + } +}; + +template +struct ei_product_coeff_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + } +}; + +template +struct ei_product_coeff_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) + { + ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + for(int i = 1; i < lhs.cols(); ++i) + res += lhs.coeff(row, i) * rhs.coeff(i, col); + } +}; + +// prevent buggy user code from causing an infinite recursion +template +struct ei_product_coeff_impl +{ + EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {} +}; + +/******************************************* +*** Scalar path with inner vectorization *** +*******************************************/ + +template +struct ei_product_coeff_vectorized_unroller +{ + enum { PacketSize = ei_packet_traits::size }; + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + ei_product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); + pres = ei_padd(pres, ei_pmul( lhs.template packet(row, Index) , rhs.template packet(Index, col) )); + } +}; + +template +struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + pres = ei_pmul(lhs.template packet(row, 0) , rhs.template packet(0, col)); + } +}; + +template +struct ei_product_coeff_impl +{ + typedef typename Lhs::PacketScalar PacketScalar; + enum { PacketSize = ei_packet_traits::size }; + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + PacketScalar pres; + ei_product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); + ei_product_coeff_impl::run(row, col, lhs, rhs, res); + res = ei_predux(pres); + } +}; + +template +struct ei_product_coeff_vectorized_dyn_selector +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Block::ColsAtCompileTime>, + Block::RowsAtCompileTime, 1>, + LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col)); + } +}; + +// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower +// NOTE maybe they are now useless since we have a specialization for Block +template +struct ei_product_coeff_vectorized_dyn_selector +{ + EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Lhs, + Block::RowsAtCompileTime, 1>, + LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col)); + } +}; + +template +struct ei_product_coeff_vectorized_dyn_selector +{ + EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Block::ColsAtCompileTime>, + Rhs, + LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs); + } +}; + +template +struct ei_product_coeff_vectorized_dyn_selector +{ + EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Lhs, + Rhs, + LinearVectorization, NoUnrolling>::run(lhs, rhs); + } +}; + +template +struct ei_product_coeff_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + ei_product_coeff_vectorized_dyn_selector::run(row, col, lhs, rhs, res); + } +}; + +/******************* +*** Packet path *** +*******************/ + +template +struct ei_product_packet_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + ei_product_packet_impl::run(row, col, lhs, rhs, res); + res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet(Index, col), res); + } +}; + +template +struct ei_product_packet_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + ei_product_packet_impl::run(row, col, lhs, rhs, res); + res = ei_pmadd(lhs.template packet(row, Index), ei_pset1(rhs.coeff(Index, col)), res); + } +}; + +template +struct ei_product_packet_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col)); + } +}; + +template +struct ei_product_packet_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col))); + } +}; + +template +struct ei_product_packet_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + { + ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col)); + for(int i = 1; i < lhs.cols(); ++i) + res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res); + } +}; + +template +struct ei_product_packet_impl +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + { + ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col))); + for(int i = 1; i < lhs.cols(); ++i) + res = ei_pmadd(lhs.template packet(row, i), ei_pset1(rhs.coeff(i, col)), res); + } +}; + +#endif // EIGEN_GENERAL_UNROLLED_PRODUCT_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index acd239d28..1e92ada27 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -339,4 +339,56 @@ struct ei_product_selfadjoint_matrix +struct ei_traits > + : ei_traits, Lhs, Rhs> > +{}; + +template +struct SelfadjointProductMatrix + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + enum { + LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit), + LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit, + RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit), + RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit + }; + + template void addTo(Dest& dst, Scalar alpha) const + { + ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)), + EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular, + ei_traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)), + ei_traits::Flags&RowMajorBit ? RowMajor : ColMajor> + ::run( + lhs.rows(), rhs.cols(), // sizes + &lhs.coeff(0,0), lhs.stride(), // lhs info + &rhs.coeff(0,0), rhs.stride(), // rhs info + &dst.coeffRef(0,0), dst.stride(), // result info + actualAlpha // alpha + ); + } +}; + #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 279836f8a..f0004cdb9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -154,5 +154,48 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( ei_aligned_stack_delete(Scalar, const_cast(rhs), size); } +/*************************************************************************** +* Wrapper to ei_product_selfadjoint_vector +***************************************************************************/ + +template +struct ei_traits > + : ei_traits, Lhs, Rhs> > +{}; + +template +struct SelfadjointProductMatrix + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + enum { + LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit) + }; + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void addTo(Dest& dst, Scalar alpha) const + { + ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet"); + ei_product_selfadjoint_vector::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> + ( + lhs.rows(), // size + &lhs.coeff(0,0), lhs.stride(), // lhs info + &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info + &dst.coeffRef(0), // result info + actualAlpha // scale factor + ); + } +}; + #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 216f2dd69..f4690c0ca 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -139,7 +139,7 @@ struct ei_product_blocking_traits // register block size along the N direction (must be either 2 or 4) nr = HalfRegisterCount/2, - // register block size along the M direction (this cannot be modified) + // register block size along the M direction (currently, this one cannot be modified) mr = 2 * PacketSize, // max cache block size along the K direction diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index ae79c2985..67747d94f 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -144,7 +144,7 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // const Scalar dst_var = dst_demean.rowwise().squaredNorm().sum() * one_over_n; // Eq. (38) - const MatrixType sigma = (dst_demean * src_demean.transpose() * one_over_n).lazy(); + const MatrixType sigma = (one_over_n * dst_demean * src_demean.transpose()).lazy(); SVD svd(sigma); diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index a4f75e384..914dbaf74 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -148,7 +148,6 @@ template void transformations(void) Transform3 tmat3(mat3), tmat4(mat4); if(Mode!=int(AffineCompact)) tmat4.matrix()(3,3) = Scalar(1); - std::cerr << tmat3.matrix() << "\n\n" << tmat4.matrix() << "\n\n"; VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix()); Scalar a3 = ei_random(-Scalar(M_PI), Scalar(M_PI)); diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index ee14ae07b..a96bb23da 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -65,7 +65,12 @@ template void nomalloc(const MatrixType& m) VERIFY_IS_APPROX((m1+m2)*s1, s1*m1+s1*m2); VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c))); VERIFY_IS_APPROX(m1.cwise() * m1.block(0,0,rows,cols), m1.cwise() * m1); - VERIFY_IS_APPROX((m1*m1.transpose())*m2, m1*(m1.transpose()*m2)); + if (MatrixType::RowsAtCompileTime