From aa2b46aa9151bd739ba02114e1dad643a8cc5c4d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Jul 2010 16:29:29 +0200 Subject: [PATCH] allow vectorization of mat44.col() by adding a InnerPanel boolean template parameter to Block --- Eigen/src/Core/Block.h | 17 +++++++++-------- Eigen/src/Core/DenseBase.h | 12 ++++++------ Eigen/src/Core/Map.h | 2 +- Eigen/src/Core/arch/SSE/MathFunctions.h | 2 +- Eigen/src/Core/util/ForwardDeclarations.h | 2 +- test/vectorization_logic.cpp | 12 ++++++++++++ 6 files changed, 30 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index a16aa22d4..2a28ea7cd 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -58,8 +58,8 @@ * * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock */ -template -struct ei_traits > : ei_traits +template +struct ei_traits > : ei_traits { typedef typename ei_traits::Scalar Scalar; typedef typename ei_traits::StorageKind StorageKind; @@ -92,15 +92,16 @@ struct ei_traits > : ei_tr MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % ei_packet_traits::size) == 0) && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && ((OuterStrideAtCompileTime % ei_packet_traits::size) == 0)) ? AlignedBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, - Flags0 = ei_traits::Flags & (HereditaryBits | MaskPacketAccessBit | LvalueBit | DirectAccessBit), + Flags0 = ei_traits::Flags & (HereditaryBits | MaskPacketAccessBit | LvalueBit | DirectAccessBit | MaskAlignedBit), Flags1 = Flags0 | FlagsLinearAccessBit, Flags = (Flags1 & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0) }; }; -template class Block - : public ei_dense_xpr_base >::type +template class Block + : public ei_dense_xpr_base >::type { public: @@ -229,9 +230,9 @@ template c }; /** \internal */ -template -class Block - : public MapBase > +template +class Block + : public MapBase > { public: diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 02eb31d76..a00b791cd 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -233,17 +233,17 @@ template class DenseBase /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, ei_traits::ColsAtCompileTime, 1> EigenvaluesReturnType; /** \internal expression type of a column */ - typedef Block::RowsAtCompileTime, 1> ColXpr; + typedef Block::RowsAtCompileTime, 1, !IsRowMajor> ColXpr; /** \internal expression type of a row */ - typedef Block::ColsAtCompileTime> RowXpr; + typedef Block::ColsAtCompileTime, IsRowMajor> RowXpr; /** \internal expression type of a block of whole columns */ - typedef Block::RowsAtCompileTime, Dynamic> ColsBlockXpr; + typedef Block::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr; /** \internal expression type of a block of whole rows */ - typedef Block::ColsAtCompileTime> RowsBlockXpr; + typedef Block::ColsAtCompileTime, IsRowMajor> RowsBlockXpr; /** \internal expression type of a block of whole columns */ - template struct NColsBlockXpr { typedef Block::RowsAtCompileTime, N> Type; }; + template struct NColsBlockXpr { typedef Block::RowsAtCompileTime, N, !IsRowMajor> Type; }; /** \internal expression type of a block of whole rows */ - template struct NRowsBlockXpr { typedef Block::ColsAtCompileTime> Type; }; + template struct NRowsBlockXpr { typedef Block::ColsAtCompileTime, IsRowMajor> Type; }; #endif // not EIGEN_PARSED_BY_DOXYGEN diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 3386c6d69..763948453 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -100,7 +100,7 @@ struct ei_traits > || ( OuterStrideAtCompileTime!=Dynamic && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ), Flags0 = ei_traits::Flags, - Flags1 = IsAligned ? int(Flags0) | AlignedBit : int(Flags0) & ~AlignedBit, + Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = HasNoStride ? int(Flags1) : int(Flags1 & ~LinearAccessBit), Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit) }; diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index e4ca82985..cb73fd205 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -373,7 +373,7 @@ Packet4f ei_pcos(const Packet4f& _x) return _mm_xor_ps(y, sign_bit); } -// This is Quake3's fast inverse square root. +// This is based on Quake3's fast inverse square root. // For detail see here: http://www.beyond3d.com/content/articles/8/ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ei_psqrt(const Packet4f& _x) diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index e4466db5b..423aa110e 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -60,7 +60,7 @@ template class NestByValue; template class ForceAlignedAccess; template class SwapWrapper; -template::ret> class Block; template class VectorBlock; diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 4bc7e58c5..2b23d4082 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -149,8 +149,20 @@ template::Vectorizable> VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), LinearTraversal,CompleteUnrolling)); + VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), + InnerVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1), + InnerVectorizedTraversal,CompleteUnrolling)); + if(PacketSize>1) { + typedef Matrix Matrix33c; + VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), + LinearTraversal,CompleteUnrolling)); + VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), + LinearTraversal,CompleteUnrolling)); + VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling));