From 861d18d5532546ddb0cd2bff8795eda080ce0c85 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 12 Jul 2008 22:59:34 +0000 Subject: [PATCH] * Optimization: added a specialization of Block for xpr with DirectAccessBit * some simplifications and fixes in cache friendly products --- Eigen/src/Core/Block.h | 139 +++++++++++++++++++++- Eigen/src/Core/CacheFriendlyProduct.h | 99 +++++++-------- Eigen/src/Core/Product.h | 24 +--- Eigen/src/Core/util/Constants.h | 5 + Eigen/src/Core/util/ForwardDeclarations.h | 3 +- bench/benchBlasGemm.cpp | 8 +- 6 files changed, 198 insertions(+), 80 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index ae4e83c9e..e7eb87a26 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -56,8 +56,8 @@ * * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock */ -template -struct ei_traits > +template +struct ei_traits > { typedef typename MatrixType::Scalar Scalar; enum{ @@ -83,8 +83,8 @@ struct ei_traits > }; }; -template class Block - : public MatrixBase > +template class Block + : public MatrixBase > { public: @@ -203,6 +203,137 @@ template class Block const ei_int_if_dynamic m_blockCols; }; +/** \internal */ +template class Block + : public MatrixBase > +{ + enum { + IsRowMajor = int(ei_traits::Flags)&RowMajorBit ? 1 : 0 + }; + + public: + + EIGEN_GENERIC_PUBLIC_INTERFACE(Block) + + /** Column or Row constructor + */ + inline Block(const MatrixType& matrix, int i) + : m_matrix(matrix), + m_data_ptr(&matrix.const_cast_derived().coeffRef( + (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0, + (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)), + m_blockRows(matrix.rows()), + m_blockCols(matrix.cols()) + { + ei_assert( (i>=0) && ( + ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows() + && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols()); + } + + /** Dynamic-size constructor + */ + inline Block(const MatrixType& matrix, + int startRow, int startCol, + int blockRows, int blockCols) + : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)), + m_blockRows(blockRows), m_blockCols(blockCols) + { + ei_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows) + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols)); + ei_assert(startRow >= 0 && blockRows >= 1 && startRow + blockRows <= matrix.rows() + && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols()); + } + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) + + inline int rows() const { return m_blockRows.value(); } + inline int cols() const { return m_blockCols.value(); } + + inline int stride(void) const { return m_matrix.stride(); } + + inline Scalar& coeffRef(int row, int col) + { + if (IsRowMajor) + return m_data_ptr[col + row * stride()]; + else + return m_data_ptr[row + col * stride()]; + } + + inline const Scalar coeff(int row, int col) const + { +// std::cerr << "coeff(int row, int col)\n"; + if (IsRowMajor) + return m_data_ptr[col + row * stride()]; + else + return m_data_ptr[row + col * stride()]; + } + + inline Scalar& coeffRef(int index) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + return m_data_ptr[index]; + } + + inline const Scalar coeff(int index) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + if ( (RowsAtCompileTime == 1) == IsRowMajor ) + return m_data_ptr[index]; + else + return m_data_ptr[index*stride()]; + } + + template + inline PacketScalar packet(int row, int col) const + { + if (IsRowMajor) + return ei_ploadu(&m_data_ptr[col + row * stride()]); + else + return ei_ploadu(&m_data_ptr[row + col * stride()]); + } + + template + inline void writePacket(int row, int col, const PacketScalar& x) + { + if (IsRowMajor) + ei_pstoreu(&m_data_ptr[col + row * stride()], x); + else + ei_pstoreu(&m_data_ptr[row + col * stride()], x); + } + + template + inline PacketScalar packet(int index) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + return ei_ploadu(&m_data_ptr[index]); + } + + template + inline void writePacket(int index, const PacketScalar& x) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + ei_pstoreu(&m_data_ptr[index], x); + } + + protected: + + const typename MatrixType::Nested m_matrix; + Scalar* m_data_ptr; + const ei_int_if_dynamic m_blockRows; + const ei_int_if_dynamic m_blockCols; +}; + + /** \returns a dynamic-size expression of a block in *this. * * \param startRow the first row in the block diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index a710d44d4..06b3f5876 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -367,7 +367,7 @@ static void ei_cache_friendly_product( * TODO: since rhs gets evaluated only once, no need to evaluate it */ template -EIGEN_DONT_INLINE static void ei_cache_friendly_product( +EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, @@ -408,54 +408,34 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( : alignmentStep==2 ? EvenAligned : FirstAligned; - // find how many column do we have to skip to be aligned with the result (if possible) + // find how many columns do we have to skip to be aligned with the result (if possible) int skipColumns=0; - for (; skipColumns0) + if (alignedSize>alignedStart) { switch(alignmentPattern) { @@ -475,10 +455,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize); if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize); if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize); - if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize); - if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize); - if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize); - if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize); } for (int j = peeledSize; j0) + for (int i=columnBound; i } }; -// FIXME the following is a hack to get very high perf with matrix-vector product, -// however, it would be preferable to switch for more general dynamic alignment queries +// NOTE the following specializations are because taking .col(0) on a vector is a bit slower template struct ei_product_coeff_vectorized_dyn_selector { @@ -481,14 +480,9 @@ struct ei_product_packet_impl -static void ei_cache_friendly_product( +static void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res); -enum { - HasDirectAccess, - NoDirectAccess -}; - template::RowsAtCompileTime, int LhsOrder = int(ei_traits::LhsFlags)&RowMajorBit ? RowMajor : ColMajor, @@ -507,19 +501,13 @@ struct ei_cache_friendly_product_selector // optimized colmajor * vector path template -struct ei_cache_friendly_product_selector +struct ei_cache_friendly_product_selector { - typedef typename ei_traits::_LhsNested Lhs; template inline static void run(DestDerived& res, const ProductType& product) { - ei_scalar_sum_op _sum; const int size = product.rhs().rows(); for (int k=0; k(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows()); - else res += product.rhs().coeff(k) * product.lhs().col(k); } }; @@ -527,7 +515,7 @@ struct ei_cache_friendly_product_selector -struct ei_cache_friendly_product_selector +struct ei_cache_friendly_product_selector { typedef typename ProductType::Scalar Scalar; @@ -545,7 +533,7 @@ struct ei_cache_friendly_product_selector >(_res, res.size()) = res; } - ei_cache_friendly_product(res.size(), + ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(), product.rhs(), _res); @@ -588,7 +576,7 @@ struct ei_cache_friendly_product_selector >(_res, res.size()) = res; } - ei_cache_friendly_product(res.size(), + ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), product.lhs().transpose(), _res); diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index c51aa282b..7e2d37dff 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -206,6 +206,11 @@ enum { RowMajor = RowMajorBit }; +enum { + NoDirectAccess = 0, + HasDirectAccess = DirectAccessBit +}; + const int FullyCoherentAccessPattern = 0x1; const int InnerCoherentAccessPattern = 0x2 | FullyCoherentAccessPattern; const int OuterCoherentAccessPattern = 0x4 | InnerCoherentAccessPattern; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 922a3716f..f96d57747 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -42,7 +42,8 @@ class Matrix; template class Flagged; template class NestByValue; template class Minor; -template class Block; +template::Flags&DirectAccessBit> class Block; template class Transpose; template class Conjugate; template class CwiseNullaryOp; diff --git a/bench/benchBlasGemm.cpp b/bench/benchBlasGemm.cpp index 02f067e1a..5455b6ed5 100644 --- a/bench/benchBlasGemm.cpp +++ b/bench/benchBlasGemm.cpp @@ -82,12 +82,12 @@ int main(int argc, char *argv[]) std::cout << "Usage: " << argv[0] << " size nbloops nbtries\n"; std::cout << "Usage: " << argv[0] << " M N K nbloops nbtries\n"; std::cout << "Usage: " << argv[0] << " check\n"; - std::cout << "Options:\n" + std::cout << "Options:\n"; std::cout << " size unique size of the 2 matrices (integer)\n"; std::cout << " auto automatically set the number of repetitions and tries\n"; - std::cout << " nbloops number of times the GEMM routines is executed\n" - std::cout << " nbtries number of times the loop is benched (return the best try)\n" - std::cout << " M N K sizes of the matrices: MxN = MxK * KxN (integers)\n" + std::cout << " nbloops number of times the GEMM routines is executed\n"; + std::cout << " nbtries number of times the loop is benched (return the best try)\n"; + std::cout << " M N K sizes of the matrices: MxN = MxK * KxN (integers)\n"; std::cout << " check check eigen product using cblas as a reference\n"; exit(1); }