From c9b046d5d5eba6e3f454ec2a125d74a21c61d988 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jul 2008 22:30:18 +0000 Subject: [PATCH] * added optimized paths for matrix-vector and vector-matrix products (using either a cache friendly strategy or re-using dot-product vectorized implementation) * add LinearAccessBit to Transpose --- Eigen/Core | 2 +- Eigen/src/Core/Block.h | 13 ++-- Eigen/src/Core/MatrixBase.h | 2 +- Eigen/src/Core/Product.h | 125 +++++++++++++++++++++++++++++++----- Eigen/src/Core/Transpose.h | 24 ++++++- 5 files changed, 143 insertions(+), 23 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index ecfd141b2..333e80a6d 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -41,12 +41,12 @@ namespace Eigen { #include "src/Core/CwiseUnaryOp.h" #include "src/Core/CwiseNullaryOp.h" #include "src/Core/InverseProduct.h" +#include "src/Core/Dot.h" #include "src/Core/Product.h" #include "src/Core/DiagonalProduct.h" #include "src/Core/Block.h" #include "src/Core/Minor.h" #include "src/Core/Transpose.h" -#include "src/Core/Dot.h" #include "src/Core/DiagonalMatrix.h" #include "src/Core/DiagonalCoeffs.h" #include "src/Core/Sum.h" diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index d954e1a2e..245357511 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -155,7 +155,7 @@ template class Block return m_matrix.const_cast_derived() .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); - + } inline const Scalar coeff(int index) const @@ -168,20 +168,23 @@ template class Block template inline PacketScalar packet(int row, int col) const { - return m_matrix.template packet(row + m_startRow.value(), col + m_startCol.value()); + return m_matrix.template packet + (row + m_startRow.value(), col + m_startCol.value()); } template inline void writePacket(int row, int col, const PacketScalar& x) { - m_matrix.const_cast_derived().template writePacket(row + m_startRow.value(), col + m_startCol.value(), x); + m_matrix.const_cast_derived().template writePacket + (row + m_startRow.value(), col + m_startCol.value(), x); } template inline PacketScalar packet(int index) const { - return m_matrix.template packet(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_matrix.template packet + (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } template diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index aba4817ac..e8b317a2a 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -219,7 +219,7 @@ template class MatrixBase /** Overloaded for cache friendly product evaluation */ template Derived& lazyAssign(const Flagged& other) - { lazyAssign(other._expression()); } + { return lazyAssign(other._expression()); } /** Overloaded for sparse product evaluation */ template diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 0a301f983..6089c1be5 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -89,9 +89,11 @@ template struct ei_product_mode ? DiagonalProduct : (Rhs::Flags & Lhs::Flags & SparseBit) ? SparseProduct - : Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + : Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + && ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + || ((Rhs::Flags&RowMajorBit) && Lhs::IsVectorAtCompileTime)) + && ( Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + || ((!(Lhs::Flags&RowMajorBit)) && Rhs::IsVectorAtCompileTime)) ? CacheFriendlyProduct : NormalProduct }; }; @@ -161,7 +163,7 @@ struct ei_traits > * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. */ CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (InnerSize!=Dynamic) && (InnerSize % ei_packet_traits::size == 0) + && (InnerSize % ei_packet_traits::size == 0) }; }; @@ -181,7 +183,7 @@ template class Product PacketSize = ei_packet_traits::size, InnerSize = ei_traits::InnerSize, Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - CanVectorizeInner = ei_traits::CanVectorizeInner && Unroll + CanVectorizeInner = ei_traits::CanVectorizeInner }; typedef ei_product_coeff_impl class Product /** \internal * \returns whether it is worth it to use the cache friendly product. */ - inline bool _useCacheFriendlyProduct() const { - return rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD; + inline bool _useCacheFriendlyProduct() const + { + return m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + && (rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + || ((_RhsNested::Flags&RowMajorBit) && _LhsNested::IsVectorAtCompileTime)) + && (cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + || ((!(_LhsNested::Flags&RowMajorBit)) && _RhsNested::IsVectorAtCompileTime)); } inline int rows() const { return m_lhs.rows(); } @@ -245,6 +250,9 @@ template class Product return res; } + const _LhsNested& lhs() const { return m_lhs; } + const _RhsNested& rhs() const { return m_rhs; } + protected: const LhsNested m_lhs; const RhsNested m_rhs; @@ -324,18 +332,18 @@ struct ei_product_coeff_impl *******************************************/ template -struct ei_product_coeff_vectorized_impl +struct ei_product_coeff_vectorized_unroller { enum { PacketSize = ei_packet_traits::size }; inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) { - ei_product_coeff_vectorized_impl::run(row, col, lhs, rhs, pres); + ei_product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); pres = ei_padd(pres, ei_pmul( lhs.template packet(row, Index) , rhs.template packet(Index, col) )); } }; template -struct ei_product_coeff_vectorized_impl<0, Lhs, Rhs, PacketScalar> +struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> { inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) { @@ -351,12 +359,59 @@ struct ei_product_coeff_impl inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { PacketScalar pres; - ei_product_coeff_vectorized_impl::run(row, col, lhs, rhs, pres); + ei_product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); ei_product_coeff_impl::run(row, col, lhs, rhs, res); res = ei_predux(pres); } }; +// FIXME the following is a hack to get very high perf with matrix-vector product, +// however, it would be preferable to switch for more general dynamic alignment queries +template +struct ei_product_coeff_vectorized_dyn_selector +{ + inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Block::ColsAtCompileTime>, + Block::RowsAtCompileTime, 1>, + LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col)); + } +}; + +template +struct ei_product_coeff_vectorized_dyn_selector +{ + inline static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Lhs, + Block::RowsAtCompileTime, 1>, + LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col)); + } +}; + +template +struct ei_product_coeff_vectorized_dyn_selector +{ + inline static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Block::ColsAtCompileTime>, + Rhs, + LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs); + } +}; + +template +struct ei_product_coeff_impl +{ + inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + ei_product_coeff_vectorized_dyn_selector::run(row, col, lhs, rhs, res); + } +}; + /******************* *** Packet path *** *******************/ @@ -425,6 +480,46 @@ struct ei_product_packet_impl::RowsAtCompileTime, + int LhsOrder = int(ei_traits::LhsFlags)&RowMajorBit ? RowMajor : ColMajor, + int RhsCols = ei_traits::ColsAtCompileTime, + int RhsOrder = int(ei_traits::RhsFlags)&RowMajorBit ? RowMajor : ColMajor> +struct ei_cache_friendly_product_selector +{ + template + inline static void run(DestDerived& res, const ProductType& product) + { + product._cacheFriendlyEvalAndAdd(res); + } +}; + +// optimized colmajor * vector path +template +struct ei_cache_friendly_product_selector +{ + template + inline static void run(DestDerived& res, const ProductType& product) + { + const int rows = product.rhs().rows(); + for (int j=0; j +struct ei_cache_friendly_product_selector +{ + template + inline static void run(DestDerived& res, const ProductType& product) + { + const int cols = product.lhs().cols(); + for (int j=0; j template @@ -432,7 +527,7 @@ inline Derived& MatrixBase::operator+=(const Flagged, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other) { if (other._expression()._useCacheFriendlyProduct()) - other._expression()._cacheFriendlyEvalAndAdd(const_cast_derived()); + ei_cache_friendly_product_selector >::run(const_cast_derived(), other._expression()); else lazyAssign(derived() + other._expression()); return derived(); @@ -445,7 +540,7 @@ inline Derived& MatrixBase::lazyAssign(const Product >::run(const_cast_derived(), product); } else { diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index aa2ac44d5..dba19f025 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -49,7 +49,7 @@ struct ei_traits > MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime, Flags = ((int(_MatrixTypeNested::Flags) ^ RowMajorBit) - & ~( LinearAccessBit | LowerTriangularBit | UpperTriangularBit)) + & ~(LowerTriangularBit | UpperTriangularBit)) | (int(_MatrixTypeNested::Flags)&UpperTriangularBit ? LowerTriangularBit : 0) | (int(_MatrixTypeNested::Flags)&LowerTriangularBit ? UpperTriangularBit : 0), CoeffReadCost = _MatrixTypeNested::CoeffReadCost @@ -84,6 +84,16 @@ template class Transpose return m_matrix.coeff(col, row); } + inline const Scalar coeff(int index) const + { + return m_matrix.coeff(index); + } + + inline Scalar& coeffRef(int index) + { + return m_matrix.const_cast_derived().coeffRef(index); + } + template inline const PacketScalar packet(int row, int col) const { @@ -96,6 +106,18 @@ template class Transpose m_matrix.const_cast_derived().template writePacket(col, row, x); } + template + inline const PacketScalar packet(int index) const + { + return m_matrix.template packet(index); + } + + template + inline void writePacket(int index, const PacketScalar& x) + { + m_matrix.const_cast_derived().template writePacket(index, x); + } + protected: const typename MatrixType::Nested m_matrix; };