* Big change in Block and Map:
  - added a MapBase base expression on top of which Map and the specialization
    of Block are implemented
  - MapBase forces both aligned loads (and aligned stores, see below) in expressions
    such as "x.block(...) += other_expr"
* Significant vectorization improvement:
  - added an AlignedBit flag meaning the first coeff/packet is aligned, which
    avoids generating extra code to deal with the first unaligned part (see the
    sketch below)
  - removed all unaligned stores when not unrolling
  - removed unaligned loads in Sum when the input has the DirectAccessBit flag
* Some code simplification in the CacheFriendly product
* Some minor documentation improvements
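
To make the strategy concrete, here is a minimal standalone sketch of the peeling pattern the new LinearVectorization path amounts to. This is not Eigen code: it uses raw SSE intrinsics, float packets of four, and assumes `dst` is at least 4-byte aligned.

#include <cstddef>
#include <xmmintrin.h> // SSE, x86 only

// Scalar head up to the first 16-byte boundary, aligned packet loop,
// then a scalar tail: the shape of "x.block(...) += other_expr".
void add_to(float* dst, const float* src, int size)
{
  const int packetSize = 4;
  // first index at which dst sits on a 16-byte boundary
  int alignedStart = int((packetSize
      - (reinterpret_cast<std::size_t>(dst) / sizeof(float)) % packetSize) % packetSize);
  if (alignedStart > size) alignedStart = size;
  const int alignedEnd = alignedStart + ((size - alignedStart) / packetSize) * packetSize;

  for (int i = 0; i < alignedStart; ++i)           // unaligned head, scalar
    dst[i] += src[i];
  for (int i = alignedStart; i < alignedEnd; i += packetSize) {
    __m128 a = _mm_load_ps(dst + i);               // dst is aligned here by construction
    __m128 b = _mm_loadu_ps(src + i);              // src alignment unknown: unaligned load
    _mm_store_ps(dst + i, _mm_add_ps(a, b));       // aligned store, never an unaligned one
  }
  for (int i = alignedEnd; i < size; ++i)          // tail, scalar
    dst[i] += src[i];
}

When the destination carries AlignedBit, alignedStart is known to be zero at compile time and the scalar head disappears entirely, which is exactly what the DstIsAligned shortcuts in the Assign.h hunks below exploit.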
Author: Gael Guennebaud, 2008-08-09 18:41:24 +00:00
parent becbeda50a
commit 4fa40367e9
17 changed files with 397 additions and 296 deletions

@@ -45,6 +45,8 @@ namespace Eigen {
 #include "src/Core/Product.h"
 #include "src/Core/DiagonalProduct.h"
 #include "src/Core/InverseProduct.h"
+#include "src/Core/MapBase.h"
+#include "src/Core/Map.h"
 #include "src/Core/Block.h"
 #include "src/Core/Minor.h"
 #include "src/Core/Transpose.h"
@@ -54,7 +56,6 @@ namespace Eigen {
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/Map.h"
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"

@@ -34,6 +34,13 @@
 template <typename Derived, typename OtherDerived>
 struct ei_assign_traits
 {
+public:
+  enum {
+    DstIsAligned = Derived::Flags & AlignedBit,
+    SrcIsAligned = OtherDerived::Flags & AlignedBit,
+    SrcAlignment = DstIsAligned && SrcIsAligned ? Aligned : Unaligned
+  };
+
 private:
   enum {
     InnerSize = int(Derived::Flags)&RowMajorBit
@@ -48,7 +55,8 @@ private:
   enum {
     MightVectorize = (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit)
                   && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
-    MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0,
+    MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                      && int(DstIsAligned) && int(SrcIsAligned),
     MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
     MaySliceVectorize = MightVectorize && int(InnerMaxSize)==Dynamic /* slice vectorization can be slow, so we only
       want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
@@ -79,7 +87,7 @@ public:
         : int(NoUnrolling)
       )
     : int(Vectorization) == int(LinearVectorization)
-    ? ( int(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
+    ? ( int(MayUnrollCompletely) && int(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
     : int(NoUnrolling)
   };
 };
@@ -154,7 +162,7 @@ struct ei_assign_innervec_CompleteUnrolling
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    dst.template copyPacket<Derived2, Aligned>(row, col, src);
+    dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src);
     ei_assign_innervec_CompleteUnrolling<Derived1, Derived2,
       Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
   }
@@ -173,7 +181,7 @@ struct ei_assign_innervec_InnerUnrolling
   {
     const int row = int(Derived1::Flags)&RowMajorBit ? row_or_col : Index;
     const int col = int(Derived1::Flags)&RowMajorBit ? Index : row_or_col;
-    dst.template copyPacket<Derived2, Aligned>(row, col, src);
+    dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src);
     ei_assign_innervec_InnerUnrolling<Derived1, Derived2,
       Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, row_or_col);
   }
@@ -256,9 +264,9 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
     for(int i = 0; i < innerSize; i+=packetSize)
     {
       if(int(Derived1::Flags)&RowMajorBit)
-        dst.template copyPacket<Derived2, Aligned>(j, i, src);
+        dst.template copyPacket<Derived2, Aligned, Aligned>(j, i, src);
       else
-        dst.template copyPacket<Derived2, Aligned>(i, j, src);
+        dst.template copyPacket<Derived2, Aligned, Aligned>(i, j, src);
     }
   }
 };
@@ -298,14 +306,19 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
   {
     const int size = dst.size();
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
-    const int alignedSize = (size/packetSize)*packetSize;
+    const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                           : ei_alignmentOffset(&dst.coeffRef(0), size);
+    const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
 
-    for(int index = 0; index < alignedSize; index += packetSize)
+    for(int index = 0; index < alignedStart; index++)
+      dst.copyCoeff(index, src);
+
+    for(int index = alignedStart; index < alignedEnd; index += packetSize)
     {
-      dst.template copyPacket<Derived2, Aligned>(index, src);
+      dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
     }
-    for(int index = alignedSize; index < size; index++)
+
+    for(int index = alignedEnd; index < size; index++)
       dst.copyCoeff(index, src);
   }
 };
@@ -334,29 +347,45 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
   static void run(Derived1 &dst, const Derived2 &src)
   {
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+    const int packetAlignedMask = packetSize - 1;
     const int innerSize = dst.innerSize();
     const int outerSize = dst.outerSize();
-    const int alignedInnerSize = (innerSize/packetSize)*packetSize;
+    const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask;
+    int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                     : ei_alignmentOffset(&dst.coeffRef(0), innerSize);
 
     for(int i = 0; i < outerSize; i++)
     {
-      // do the vectorizable part of the assignment
-      for (int index = 0; index<alignedInnerSize ; index+=packetSize)
-      {
-        if(Derived1::Flags&RowMajorBit)
-          dst.template copyPacket<Derived2, Unaligned>(i, index, src);
-        else
-          dst.template copyPacket<Derived2, Unaligned>(index, i, src);
-      }
+      const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
 
       // do the non-vectorizable part of the assignment
-      for (int index = alignedInnerSize; index<innerSize ; index++)
+      for (int index = 0; index<alignedStart ; index++)
       {
         if(Derived1::Flags&RowMajorBit)
           dst.copyCoeff(i, index, src);
         else
           dst.copyCoeff(index, i, src);
       }
+
+      // do the vectorizable part of the assignment
+      for (int index = alignedStart; index<alignedEnd; index+=packetSize)
+      {
+        if(Derived1::Flags&RowMajorBit)
+          dst.template copyPacket<Derived2, Aligned, Unaligned>(i, index, src);
+        else
+          dst.template copyPacket<Derived2, Aligned, Unaligned>(index, i, src);
+      }
+
+      // do the non-vectorizable part of the assignment
+      for (int index = alignedEnd; index<innerSize ; index++)
+      {
+        if(Derived1::Flags&RowMajorBit)
+          dst.copyCoeff(i, index, src);
+        else
+          dst.copyCoeff(index, i, src);
+      }
+
+      alignedStart = (alignedStart+alignedStep)%packetSize;
     }
   }
 };
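
Both new paths above lean on ei_alignmentOffset, whose definition is not part of this diff. Its assumed contract (a hedged reading, not the actual implementation): return the smallest index, capped by the second argument, at which the pointer lands on a packet boundary. A standalone sketch of that computation:

#include <cstddef>

// Sketch only: ei_alignmentOffset(ptr, max) is assumed to yield the first
// i in [0, max] with ptr + i packet-aligned, or max when unreachable.
template <typename Scalar>
int alignment_offset_sketch(const Scalar* ptr, int maxOffset)
{
  const int packetBytes = 16; // SSE-sized packet, an assumption
  const int packetSize = packetBytes / int(sizeof(Scalar));
  const std::size_t addr = reinterpret_cast<std::size_t>(ptr);
  if (addr % sizeof(Scalar) != 0)
    return maxOffset; // stepping by whole Scalars can never reach a boundary
  const int offset = int((packetSize - (addr / sizeof(Scalar)) % packetSize) % packetSize);
  return offset < maxOffset ? offset : maxOffset;
}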

@@ -33,6 +33,8 @@
  * \param MatrixType the type of the object in which we are taking a block
  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
+ * \param _PacketAccess
+ * \param _DirectAccessStatus \internal used for partial specialization
  *
  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
  * type of MatrixBase::block(int,int,int,int) and MatrixBase::block<int,int>(int,int) and
@@ -56,8 +58,8 @@
  *
  * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock
  */
-template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus>
-struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
+template<typename MatrixType, int BlockRows, int BlockCols, int _PacketAccess, int _DirectAccessStatus>
+struct ei_traits<Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus> >
 {
   typedef typename MatrixType::Scalar Scalar;
   enum{
@@ -74,17 +76,21 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
     RowMajor = int(MatrixType::Flags)&RowMajorBit,
     InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
     InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
-    MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
+    MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize >= ei_packet_traits<Scalar>::size))
                         ? PacketAccessBit : 0,
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
     Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
           | FlagsLinearAccessBit,
-    CoeffReadCost = MatrixType::CoeffReadCost
+    CoeffReadCost = MatrixType::CoeffReadCost,
+    PacketAccess = _PacketAccess
   };
+  typedef typename ei_meta_if<int(PacketAccess)==Aligned,
+    Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus>&,
+    Block<MatrixType, BlockRows, BlockCols, Aligned, _DirectAccessStatus> >::ret AlignedDerivedType;
 };
 
-template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> class Block
-  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
+template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess, int _DirectAccessStatus> class Block
+  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, PacketAccess, _DirectAccessStatus> >
 {
   public:
@@ -205,26 +211,36 @@ template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> class Block
 };
 
 /** \internal */
-template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess>
-  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols,HasDirectAccess> >
+template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess>
+class Block<MatrixType,BlockRows,BlockCols,PacketAccess,HasDirectAccess>
+  : public MapBase<Block<MatrixType, BlockRows, BlockCols,PacketAccess,HasDirectAccess> >
 {
-    enum {
-      IsRowMajor = int(ei_traits<MatrixType>::Flags)&RowMajorBit ? 1 : 0
-    };
-
   public:
 
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
+    _EIGEN_GENERIC_PUBLIC_INTERFACE(Block, MapBase<Block>)
+    typedef typename ei_traits<Block>::AlignedDerivedType AlignedDerivedType;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+
+    AlignedDerivedType allowAligned()
+    {
+      if (PacketAccess==Aligned)
+        return *this;
+      else
+        return Block<MatrixType,BlockRows,BlockCols,Aligned,HasDirectAccess>
+                 (m_matrix, Base::m_data, Base::m_rows.value(), Base::m_cols.value());
+    }
 
     /** Column or Row constructor
       */
     inline Block(const MatrixType& matrix, int i)
-      : m_matrix(matrix),
-        m_data_ptr(&matrix.const_cast_derived().coeffRef(
-          (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0,
-          (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
-        m_blockRows(matrix.rows()),
-        m_blockCols(matrix.cols())
+      : Base(&matrix.const_cast_derived().coeffRef(
+              (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0,
+              (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+             BlockRows==1 ? 1 : matrix.rows(),
+             BlockCols==1 ? 1 : matrix.cols()),
+        m_matrix(matrix)
     {
       ei_assert( (i>=0) && (
           ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i<matrix.rows())
@@ -234,13 +250,10 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess>
     /** Fixed-size constructor
       */
     inline Block(const MatrixType& matrix, int startRow, int startCol)
-      : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)),
-        m_blockRows(matrix.rows()), m_blockCols(matrix.cols())
+      : Base(&matrix.const_cast_derived().coeffRef(startRow,startCol)), m_matrix(matrix)
     {
-      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic,this_method_is_only_for_fixed_size);
-      ei_assert(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic);
       ei_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows()
           && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols());
     }
 
     /** Dynamic-size constructor
@@ -248,91 +261,25 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess>
     inline Block(const MatrixType& matrix,
                  int startRow, int startCol,
                  int blockRows, int blockCols)
-      : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)),
-        m_blockRows(blockRows), m_blockCols(blockCols)
+      : Base(&matrix.const_cast_derived().coeffRef(startRow,startCol), blockRows, blockCols),
+        m_matrix(matrix)
     {
       ei_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
           && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
       ei_assert(startRow >= 0 && blockRows >= 1 && startRow + blockRows <= matrix.rows()
          && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols());
     }
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-
-    inline int rows() const { return m_blockRows.value(); }
-    inline int cols() const { return m_blockCols.value(); }
-
     inline int stride(void) const { return m_matrix.stride(); }
 
-    inline Scalar& coeffRef(int row, int col)
-    {
-      if (IsRowMajor)
-        return m_data_ptr[col + row * stride()];
-      else
-        return m_data_ptr[row + col * stride()];
-    }
-
-    inline const Scalar coeff(int row, int col) const
-    {
-      if (IsRowMajor)
-        return m_data_ptr[col + row * stride()];
-      else
-        return m_data_ptr[row + col * stride()];
-    }
-
-    inline Scalar& coeffRef(int index)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      return m_data_ptr[index];
-    }
-
-    inline const Scalar coeff(int index) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      if ( (RowsAtCompileTime == 1) == IsRowMajor )
-        return m_data_ptr[index];
-      else
-        return m_data_ptr[index*stride()];
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int row, int col) const
-    {
-      if (IsRowMajor)
-        return ei_ploadu(&m_data_ptr[col + row * stride()]);
-      else
-        return ei_ploadu(&m_data_ptr[row + col * stride()]);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(int row, int col, const PacketScalar& x)
-    {
-      if (IsRowMajor)
-        ei_pstoreu(&m_data_ptr[col + row * stride()], x);
-      else
-        ei_pstoreu(&m_data_ptr[row + col * stride()], x);
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int index) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      return ei_ploadu(&m_data_ptr[index]);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(int index, const PacketScalar& x)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      ei_pstoreu(&m_data_ptr[index], x);
-    }
-
   protected:
 
+    /** \internal used by allowAligned() */
+    inline Block(const MatrixType& matrix, const Scalar* data, int blockRows, int blockCols)
+      : Base(data, blockRows, blockCols), m_matrix(matrix)
+    {}
+
     const typename MatrixType::Nested m_matrix;
-    Scalar* m_data_ptr;
-    const ei_int_if_dynamic<RowsAtCompileTime> m_blockRows;
-    const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols;
 };
 
 /** \returns a dynamic-size expression of a block in *this.
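
A note on the AlignedDerivedType/allowAligned() pair above: when the Block is already typed Aligned, allowAligned() hands back *this by reference; otherwise it rebuilds an Aligned-typed twin over the same data pointer (via the new protected constructor), so MapBase's compound operators can re-enter assignment with aligned packet access. A minimal standalone sketch of the ei_meta_if device doing the type selection:

// Compile-time if over types, as used for AlignedDerivedType.
template <bool Condition, typename Then, typename Else>
struct meta_if                    { typedef Then ret; };
template <typename Then, typename Else>
struct meta_if<false, Then, Else> { typedef Else ret; };

enum { Aligned = 0, Unaligned = 1 }; // numeric values as in this commit

template <int PacketAccess>
struct Expr
{
  // Already Aligned: *this by reference. Otherwise: an Aligned-typed twin.
  typedef typename meta_if<PacketAccess == Aligned,
                           Expr<PacketAccess>&,
                           Expr<Aligned> >::ret AlignedDerivedType;
};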

@@ -420,15 +420,18 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     ei_internal_assert((alignmentPattern==NoneAligned) || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
   }
 
+  int offset1 = (FirstAligned && alignmentStep==1?3:1);
+  int offset3 = (FirstAligned && alignmentStep==1?1:3);
+
   int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
-    Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+1]),
-           ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]);
+    Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+offset1]),
+           ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+offset3]);
 
     // this helps a lot generating better binary code
-    const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride,
-                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
+    const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
 
     if (PacketSize>1)
     {
@@ -453,12 +456,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
       if(peels>1)
       {
         Packet A00, A01, A02, A03, A10, A11, A12, A13;
-        if (alignmentStep==1)
-        {
-          A00 = ptmp1; ptmp1 = ptmp3; ptmp3 = A00;
-          const Scalar* aux = lhs1;
-          lhs1 = lhs3; lhs3 = aux;
-        }
 
         A01 = ei_pload(&lhs1[alignedStart-1]);
         A02 = ei_pload(&lhs2[alignedStart-2]);
@@ -614,14 +611,17 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
         || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
 
+  int offset1 = (FirstAligned && alignmentStep==1?3:1);
+  int offset3 = (FirstAligned && alignmentStep==1?1:3);
+
   int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
     Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
 
     // this helps the compiler generating good binary code
-    const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride,
-                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
+    const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
 
     if (PacketSize>1)
     {
@@ -658,13 +658,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
            * than basic unaligned loads.
            */
          Packet A01, A02, A03, b, A11, A12, A13;
-          if (alignmentStep==1)
-          {
-            // flip row #1 and #3
-            b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
-            const Scalar* aux = lhs1;
-            lhs1 = lhs3; lhs3 = aux;
-          }
 
           A01 = ei_pload(&lhs1[alignedStart-1]);
           A02 = ei_pload(&lhs2[alignedStart-2]);
           A03 = ei_pload(&lhs3[alignedStart-3]);
@@ -690,13 +683,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
             ptmp2 = ei_pmadd(b, A12, ptmp2);
             ptmp3 = ei_pmadd(b, A13, ptmp3);
           }
-          if (alignmentStep==1)
-          {
-            // restore rows #1 and #3
-            b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
-            const Scalar* aux = lhs1;
-            lhs1 = lhs3; lhs3 = aux;
-          }
         }
         for (int j = peeledSize; j<alignedSize; j+=PacketSize)
          _EIGEN_ACCUMULATE_PACKETS(,u,u,);
@@ -720,7 +706,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
       Scalar b = rhs[j];
       tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
     }
-    res[i] += tmp0; res[i+1] += tmp1; res[i+2] += tmp2; res[i+3] += tmp3;
+    res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3;
   }
 
   // process remaining first and last rows (at most columnsAtOnce-1)
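
The offset1/offset3 pair replaces the old in-loop pointer swapping. The underlying arithmetic: with packet size P = 4, column j of a group of four is misaligned by (j*s) mod P elements, s being the per-column misalignment step. For one odd value of s the peeled loads of columns 1..3 expect offsets (1,2,3); for the other they expect (3,2,1), i.e. exactly columns 1 and 3 traded, column 2 untouched. Which branch of alignmentStep needs the trade depends on that variable's convention, which this diff does not show; the permutation fact itself can be checked standalone:

#include <cstdio>

int main()
{
  const int P = 4;                       // packet size
  for (int s = 1; s < P; s += 2) {       // the two odd steps, 1 and 3
    std::printf("s=%d:", s);
    for (int j = 1; j < P; ++j)          // columns 1..3 of a group of four
      std::printf("  col%d -> offset %d", j, (j * s) % P);
    std::printf("\n");                   // s=1: 1,2,3   s=3: 3,2,1
  }
  return 0;
}

Baking the swap into offset1/offset3 once, before the loop, also removes the need to swap back afterwards, which is what the deleted blocks above were doing.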

@@ -298,22 +298,22 @@ inline void MatrixBase<Derived>::copyCoeff(int index, const MatrixBase<OtherDerived>& other)
 }
 
 template<typename Derived>
-template<typename OtherDerived, int LoadStoreMode>
+template<typename OtherDerived, int StoreMode, int LoadMode>
 inline void MatrixBase<Derived>::copyPacket(int row, int col, const MatrixBase<OtherDerived>& other)
 {
   ei_internal_assert(row >= 0 && row < rows()
                      && col >= 0 && col < cols());
-  derived().template writePacket<LoadStoreMode>(row, col,
-    other.derived().template packet<LoadStoreMode>(row, col));
+  derived().template writePacket<StoreMode>(row, col,
+    other.derived().template packet<LoadMode>(row, col));
 }
 
 template<typename Derived>
-template<typename OtherDerived, int LoadStoreMode>
+template<typename OtherDerived, int StoreMode, int LoadMode>
 inline void MatrixBase<Derived>::copyPacket(int index, const MatrixBase<OtherDerived>& other)
 {
   ei_internal_assert(index >= 0 && index < size());
-  derived().template writePacket<LoadStoreMode>(index,
-    other.derived().template packet<LoadStoreMode>(index));
+  derived().template writePacket<StoreMode>(index,
+    other.derived().template packet<LoadMode>(index));
 }
 
 #endif // EIGEN_COEFFS_H
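
Splitting the old single LoadStoreMode into independent StoreMode and LoadMode parameters is what lets an aligned destination consume an unaligned source in one packet move. A standalone model of the split (SSE floats; not the Eigen implementation):

#include <xmmintrin.h>

enum Mode { AlignedMode, UnalignedMode };

// Mirrors copyPacket<OtherDerived, StoreMode, LoadMode>: the two modes are
// chosen independently; the compiler folds the dead branches away.
template <Mode StoreMode, Mode LoadMode>
inline void copy_packet(float* dst, const float* src)
{
  __m128 p = (LoadMode == AlignedMode) ? _mm_load_ps(src) : _mm_loadu_ps(src);
  if (StoreMode == AlignedMode) _mm_store_ps(dst, p);
  else                          _mm_storeu_ps(dst, p);
}

int main()
{
  float a[4] = {1, 2, 3, 4}, b[4];
  copy_packet<UnalignedMode, UnalignedMode>(b, a); // safe for any addresses
  return int(b[3]) == 4 ? 0 : 1;
}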

@@ -67,7 +67,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
     MaxColsAtCompileTime = Lhs::MaxColsAtCompileTime,
     Flags = (int(LhsFlags) | int(RhsFlags)) & (
         HereditaryBits
-      | (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
+      | (int(LhsFlags) & int(RhsFlags) & (LinearAccessBit | AlignedBit))
       | (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
          ? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
     CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
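
The new (LinearAccessBit | AlignedBit) term makes AlignedBit propagate through a binary expression only when both operands carry it. An illustrative standalone mask computation (constant values match this commit's Constants.h):

enum { LinearAccessBit = 0x10, AlignedBit = 0x40 };
enum {
  LhsFlags = LinearAccessBit | AlignedBit, // e.g. a plain Matrix, aligned
  RhsFlags = LinearAccessBit,              // e.g. an Unaligned Map
  SumFlags = LhsFlags & RhsFlags & (LinearAccessBit | AlignedBit)
};
// SumFlags == LinearAccessBit: the sum expression keeps linear access but
// must not claim alignment, so packet reads from it stay unaligned.
int main() { return SumFlags == LinearAccessBit ? 0 : 1; }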

@@ -55,7 +55,7 @@ struct ei_traits<CwiseUnaryOp<UnaryOp, MatrixType> >
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     Flags = (MatrixTypeFlags & (
-        HereditaryBits | LinearAccessBit
+        HereditaryBits | LinearAccessBit | AlignedBit
       | (ei_functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))),
     CoeffReadCost = MatrixTypeCoeffReadCost + ei_functor_traits<UnaryOp>::Cost
   };

@@ -123,7 +123,9 @@ struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true>
     row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index,
     col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0,
     row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index,
-    col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0
+    col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0,
+    alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned,
+    alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned
   };
 
   typedef typename Derived1::Scalar Scalar;
@@ -131,7 +133,7 @@ struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true>
   inline static PacketScalar run(const Derived1& v1, const Derived2& v2)
   {
-    return ei_pmul(v1.template packet<Aligned>(row1, col1), v2.template packet<Aligned>(row2, col2));
+    return ei_pmul(v1.template packet<alignment1>(row1, col1), v2.template packet<alignment2>(row2, col2));
   }
 };
@@ -175,20 +177,22 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
     const int size = v1.size();
     const int packetSize = ei_packet_traits<Scalar>::size;
     const int alignedSize = (size/packetSize)*packetSize;
+    const int alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned;
+    const int alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned;
     Scalar res;
 
     // do the vectorizable part of the sum
     if(size >= packetSize)
     {
       PacketScalar packet_res = ei_pmul(
-        v1.template packet<Aligned>(0),
-        v2.template packet<Aligned>(0)
+        v1.template packet<alignment1>(0),
+        v2.template packet<alignment2>(0)
       );
       for(int index = packetSize; index<alignedSize; index += packetSize)
       {
         packet_res = ei_pmadd(
-          v1.template packet<Aligned>(index),
-          v2.template packet<Aligned>(index),
+          v1.template packet<alignment1>(index),
+          v2.template packet<alignment2>(index),
           packet_res
         );
       }

@@ -2,6 +2,7 @@
 // for linear algebra. Eigen itself is part of the KDE project.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -29,8 +30,8 @@
  *
  * \brief A matrix or vector expression mapping an existing array of data.
  *
- * \param Alignment can be either Aligned or Unaligned. Tells whether the array is suitably aligned for
- *                  vectorization on the present CPU architecture. Defaults to Unaligned.
+ * \param _PacketAccess controls whether vectorized aligned loads or stores are allowed (Aligned)
+ *                      or forced to unaligned (Unaligned). Defaults to Unaligned.
  *
  * This class represents a matrix or vector expression mapping an existing array of data.
 * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
@@ -40,117 +41,43 @@
  *
  * \sa Matrix::map()
  */
-template<typename MatrixType, int Alignment>
-struct ei_traits<Map<MatrixType, Alignment> >
+template<typename MatrixType, int _PacketAccess>
+struct ei_traits<Map<MatrixType, _PacketAccess> > : public ei_traits<MatrixType>
 {
-  typedef typename MatrixType::Scalar Scalar;
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost
+    PacketAccess = _PacketAccess,
+    Flags = ei_traits<MatrixType>::Flags & ~AlignedBit
   };
+  typedef typename ei_meta_if<int(PacketAccess)==Aligned,
+    Map<MatrixType, _PacketAccess>&,
+    Map<MatrixType, Aligned> >::ret AlignedDerivedType;
 };
 
-template<typename MatrixType, int Alignment> class Map
-  : public MatrixBase<Map<MatrixType, Alignment> >
+template<typename MatrixType, int PacketAccess> class Map
  : public MapBase<Map<MatrixType, PacketAccess> >
 {
   public:
 
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Map)
+    _EIGEN_GENERIC_PUBLIC_INTERFACE(Map, MapBase<Map>)
+    typedef typename ei_traits<Map>::AlignedDerivedType AlignedDerivedType;
 
-    inline int rows() const { return m_rows.value(); }
-    inline int cols() const { return m_cols.value(); }
     inline int stride() const { return this->innerSize(); }
 
-    inline const Scalar& coeff(int row, int col) const
-    {
-      if(Flags & RowMajorBit)
-        return m_data[col + row * m_cols.value()];
-      else // column-major
-        return m_data[row + col * m_rows.value()];
-    }
-
-    inline Scalar& coeffRef(int row, int col)
-    {
-      if(Flags & RowMajorBit)
-        return const_cast<Scalar*>(m_data)[col + row * m_cols.value()];
-      else // column-major
-        return const_cast<Scalar*>(m_data)[row + col * m_rows.value()];
-    }
-
-    inline const Scalar& coeff(int index) const
-    {
-      return m_data[index];
-    }
-
-    inline Scalar& coeffRef(int index)
-    {
-      return *const_cast<Scalar*>(m_data + index);
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int row, int col) const
-    {
-      return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>
-               (m_data + (Flags & RowMajorBit
-                          ? col + row * m_cols.value()
-                          : row + col * m_rows.value()));
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int index) const
-    {
-      return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>(m_data + index);
-    }
-
-    template<int StoreMode>
-    inline void writePacket(int row, int col, const PacketScalar& x)
-    {
-      ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned>
-        (const_cast<Scalar*>(m_data) + (Flags & RowMajorBit
-                                        ? col + row * m_cols.value()
-                                        : row + col * m_rows.value()), x);
-    }
-
-    template<int StoreMode>
-    inline void writePacket(int index, const PacketScalar& x)
-    {
-      ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned>
-        (const_cast<Scalar*>(m_data) + index, x);
-    }
+    AlignedDerivedType allowAligned()
+    {
+      if (PacketAccess==Aligned)
+        return *this;
+      else
+        return Map<MatrixType,Aligned>(Base::m_data, Base::m_rows.value(), Base::m_cols.value());
+    }
 
-    inline Map(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
-    {
-      EIGEN_STATIC_ASSERT_FIXED_SIZE(MatrixType)
-    }
+    inline Map(const Scalar* data) : Base(data) {}
 
-    inline Map(const Scalar* data, int size)
-      : m_data(data),
-        m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime),
-        m_cols(ColsAtCompileTime == Dynamic ? size : ColsAtCompileTime)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(MatrixType)
-      ei_assert(size > 0);
-      ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == size);
-    }
+    inline Map(const Scalar* data, int size) : Base(data, size) {}
 
-    inline Map(const Scalar* data, int rows, int cols)
-      : m_data(data), m_rows(rows), m_cols(cols)
-    {
-      ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
-             && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
-    }
+    inline Map(const Scalar* data, int rows, int cols) : Base(data, rows, cols) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
-
-  protected:
-    const Scalar* m_data;
-    const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
-    const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
 };
 
 /** Constructor copying an existing array of data.

Eigen/src/Core/MapBase.h (new file, 167 lines)

@@ -0,0 +1,167 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_MAPBASE_H
#define EIGEN_MAPBASE_H

/** \internal
  *
  * \class MapBase
  *
  * \brief Base class for Map and Block expression with direct access
  *
  * \sa class Map, class Block
  */
template<typename Derived> class MapBase
  : public MatrixBase<Derived>
{
  public:

    typedef MatrixBase<Derived> Base;

    enum {
      IsRowMajor = int(ei_traits<Derived>::Flags) & RowMajorBit ? 1 : 0,
      PacketAccess = ei_traits<Derived>::PacketAccess,
      RowsAtCompileTime = ei_traits<Derived>::RowsAtCompileTime,
      ColsAtCompileTime = ei_traits<Derived>::ColsAtCompileTime,
      SizeAtCompileTime = Base::SizeAtCompileTime
    };

    typedef typename ei_traits<Derived>::AlignedDerivedType AlignedDerivedType;
    typedef typename ei_traits<Derived>::Scalar Scalar;
    typedef typename Base::PacketScalar PacketScalar;
    using Base::derived;

    inline int rows() const { return m_rows.value(); }
    inline int cols() const { return m_cols.value(); }
    inline int stride() const { return derived().stride(); }

    AlignedDerivedType allowAligned() { return derived().allowAligned(); }

    inline const Scalar& coeff(int row, int col) const
    {
      if(IsRowMajor)
        return m_data[col + row * stride()];
      else // column-major
        return m_data[row + col * stride()];
    }

    inline Scalar& coeffRef(int row, int col)
    {
      if(IsRowMajor)
        return const_cast<Scalar*>(m_data)[col + row * stride()];
      else // column-major
        return const_cast<Scalar*>(m_data)[row + col * stride()];
    }

    inline const Scalar coeff(int index) const
    {
      ei_assert(Derived::IsVectorAtCompileTime || (ei_traits<Derived>::Flags & LinearAccessBit));
      if ( ((RowsAtCompileTime == 1) == IsRowMajor) )
        return m_data[index];
      else
        return m_data[index*stride()];
    }

    inline Scalar& coeffRef(int index)
    {
      return *const_cast<Scalar*>(m_data + index);
    }

    template<int LoadMode>
    inline PacketScalar packet(int row, int col) const
    {
      return ei_ploadt<Scalar, int(PacketAccess) == Aligned ? Aligned : LoadMode>
               (m_data + (IsRowMajor ? col + row * stride()
                                     : row + col * stride()));
    }

    template<int LoadMode>
    inline PacketScalar packet(int index) const
    {
      return ei_ploadt<Scalar, int(PacketAccess) == Aligned ? Aligned : LoadMode>(m_data + index);
    }

    template<int StoreMode>
    inline void writePacket(int row, int col, const PacketScalar& x)
    {
      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == Aligned ? Aligned : StoreMode>
        (const_cast<Scalar*>(m_data) + (IsRowMajor ? col + row * stride()
                                                   : row + col * stride()), x);
    }

    template<int StoreMode>
    inline void writePacket(int index, const PacketScalar& x)
    {
      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == Aligned ? Aligned : StoreMode>
        (const_cast<Scalar*>(m_data) + index, x);
    }

    inline MapBase(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
    {
      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
    }

    inline MapBase(const Scalar* data, int size)
      : m_data(data),
        m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime),
        m_cols(ColsAtCompileTime == Dynamic ? size : ColsAtCompileTime)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
      ei_assert(size > 0);
      ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == size);
    }

    inline MapBase(const Scalar* data, int rows, int cols)
      : m_data(data), m_rows(rows), m_cols(cols)
    {
      ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
             && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
    }

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MapBase)
//  EIGEN_INHERIT_ASSIGNMENT_OPERATOR(MapBase, =)

    template<typename OtherDerived>
    Derived& operator+=(const MatrixBase<OtherDerived>& other)
    { return derived() = allowAligned() + other; }

    template<typename OtherDerived>
    Derived& operator-=(const MatrixBase<OtherDerived>& other)
    { return derived() = allowAligned() - other; }

    Derived& operator*=(const Scalar& other)
    { return derived() = allowAligned() * other; }

    Derived& operator/=(const Scalar& other)
    { return derived() = allowAligned() / other; }

  protected:
    const Scalar* __restrict__ m_data;
    const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
    const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
};

#endif // EIGEN_MAPBASE_H
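
What the PacketAccess parameter buys in user code, as a hedged usage sketch against the post-commit API (the Map signatures follow the diffs above; a real caller must guarantee the 16-byte alignment that the Aligned variant promises, and a stack buffer does not):

#include <Eigen/Core>

int main()
{
  float buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; // alignment NOT guaranteed here

  // Default (Unaligned): always safe; packet ops fall back to unaligned moves.
  Eigen::Map<Eigen::VectorXf> m1(buffer, 8);

  // Aligned: a promise from the caller; Eigen may then use aligned packet ops.
  Eigen::Map<Eigen::VectorXf, Eigen::Aligned> m2(buffer, 8);

  m2 += m1; // routed through MapBase::operator+= and allowAligned()
  return 0;
}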

@@ -257,9 +257,9 @@ template<typename Derived> class MatrixBase
     void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
     void copyCoeff(int index, const MatrixBase<OtherDerived>& other);
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other);
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int index, const MatrixBase<OtherDerived>& other);
 
     template<int LoadMode>

@@ -33,17 +33,22 @@
 template<typename Derived>
 struct ei_sum_traits
 {
+private:
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
+  };
+
 public:
   enum {
     Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
                  && (int(Derived::Flags)&LinearAccessBit)
+                 && (int(Derived::SizeAtCompileTime)>2*PacketSize)
                   ? LinearVectorization
                   : NoVectorization
   };
 
 private:
   enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
     Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
          + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
@@ -131,7 +136,8 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
         : Index % Derived::RowsAtCompileTime,
     col = int(Derived::Flags)&RowMajorBit
         ? Index % int(Derived::ColsAtCompileTime)
-        : Index / Derived::RowsAtCompileTime
+        : Index / Derived::RowsAtCompileTime,
+    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
   };
 
   typedef typename Derived::Scalar Scalar;
@@ -139,7 +145,7 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
   inline static PacketScalar run(const Derived &mat)
   {
-    return mat.template packet<Aligned>(row, col);
+    return mat.template packet<alignment>(row, col);
   }
 };
@@ -185,14 +191,21 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
   {
     const int size = mat.size();
     const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedSize = (size/packetSize)*packetSize;
+    const int alignedStart = (Derived::Flags & AlignedBit)
+                          || !(Derived::Flags & DirectAccessBit)
+                           ? 0
+                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
+    const int alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
+                        ? Aligned : Unaligned;
+    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
+    const int alignedEnd = alignedStart + alignedSize;
     Scalar res;
 
-    if(size >= packetSize)
+    if(Derived::SizeAtCompileTime>=2*packetSize && alignedSize >= 2*packetSize)
     {
-      PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
-      for(int index = packetSize; index < alignedSize; index += packetSize)
-        packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
+      PacketScalar packet_res = mat.template packet<alignment>(alignedStart, alignedStart);
+      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
+        packet_res = ei_padd(packet_res, mat.template packet<alignment>(index));
       res = ei_predux(packet_res);
    }
@@ -202,10 +215,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
       res = Scalar(0);
     }
 
-    for(int index = alignedSize; index < size; index++)
-    {
-      res += mat.coeff(index);
-    }
+    for(int index = 0; index < alignedStart; index++)
+      res += mat.coeff(index);
+
+    for(int index = alignedEnd; index < size; index++)
+      res += mat.coeff(index);
 
     return res;
   }
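
A worked example of the index arithmetic above, with illustrative numbers:

#include <cassert>

int main()
{
  const int size = 11, packetSize = 4;
  const int alignedStart = 2; // first aligned index, as ei_alignmentOffset would report
  const int alignedSize  = ((size - alignedStart) / packetSize) * packetSize; // 8
  const int alignedEnd   = alignedStart + alignedSize;                        // 10
  assert(alignedSize == 8 && alignedEnd == 10);
  // packets sweep [2,10); the two scalar loops cover [0,2) and [10,11),
  // so every coefficient is summed exactly once.
  return 0;
}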

@@ -59,6 +59,16 @@ template<typename ExpressionType> class SwapWrapper
     inline int cols() const { return m_expression.cols(); }
     inline int stride() const { return m_expression.stride(); }
 
+    inline Scalar& coeffRef(int row, int col)
+    {
+      return m_expression.const_cast_derived().coeffRef(row, col);
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
     template<typename OtherDerived>
     void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other)
     {
@@ -80,29 +90,29 @@ template<typename ExpressionType> class SwapWrapper
       _other.coeffRef(index) = tmp;
     }
 
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other)
     {
       OtherDerived& _other = other.const_cast_derived();
       ei_internal_assert(row >= 0 && row < rows()
                          && col >= 0 && col < cols());
-      Packet tmp = m_expression.template packet<LoadStoreMode>(row, col);
-      m_expression.template writePacket<LoadStoreMode>(row, col,
-        _other.template packet<LoadStoreMode>(row, col)
+      Packet tmp = m_expression.template packet<StoreMode>(row, col);
+      m_expression.template writePacket<StoreMode>(row, col,
+        _other.template packet<LoadMode>(row, col)
       );
-      _other.template writePacket<LoadStoreMode>(row, col, tmp);
+      _other.template writePacket<LoadMode>(row, col, tmp);
     }
 
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int index, const MatrixBase<OtherDerived>& other)
     {
       OtherDerived& _other = other.const_cast_derived();
       ei_internal_assert(index >= 0 && index < m_expression.size());
-      Packet tmp = m_expression.template packet<LoadStoreMode>(index);
-      m_expression.template writePacket<LoadStoreMode>(index,
-        _other.template packet<LoadStoreMode>(index)
+      Packet tmp = m_expression.template packet<StoreMode>(index);
+      m_expression.template writePacket<StoreMode>(index,
+        _other.template packet<LoadMode>(index)
      );
-      _other.template writePacket<LoadStoreMode>(index, tmp);
+      _other.template writePacket<LoadMode>(index, tmp);
     }
 
   protected:

@@ -119,42 +119,47 @@ const unsigned int LinearAccessBit = 0x10;
   */
 const unsigned int DirectAccessBit = 0x20;
 
+/** \ingroup flags
+  *
+  * means the first coefficient packet is guaranteed to be aligned */
+const unsigned int AlignedBit = 0x40;
+
 /** \ingroup flags
   *
   * means all diagonal coefficients are equal to 0 */
-const unsigned int ZeroDiagBit = 0x40;
+const unsigned int ZeroDiagBit = 0x80;
 
 /** \ingroup flags
   *
   * means all diagonal coefficients are equal to 1 */
-const unsigned int UnitDiagBit = 0x80;
+const unsigned int UnitDiagBit = 0x100;
 
 /** \ingroup flags
   *
   * means the matrix is selfadjoint (M=M*). */
-const unsigned int SelfAdjointBit = 0x100;
+const unsigned int SelfAdjointBit = 0x200;
 
 /** \ingroup flags
   *
   * means the strictly lower triangular part is 0 */
-const unsigned int UpperTriangularBit = 0x200;
+const unsigned int UpperTriangularBit = 0x400;
 
 /** \ingroup flags
   *
   * means the strictly upper triangular part is 0 */
-const unsigned int LowerTriangularBit = 0x400;
+const unsigned int LowerTriangularBit = 0x800;
 
 /** \ingroup flags
   *
   * means the expression includes sparse matrices and the sparse path has to be taken. */
-const unsigned int SparseBit = 0x800;
+const unsigned int SparseBit = 0x1000;
 
 /** \ingroup flags
   *
   * currently unused. Means the matrix probably has a very big size.
   * Could eventually be used as a hint to determine which algorithms
   * to use. */
-const unsigned int LargeBit = 0x1000;
+const unsigned int LargeBit = 0x2000;
 
 // list of flags that are inherited by default
 const unsigned int HereditaryBits = RowMajorBit
@@ -175,15 +180,21 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
 const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
 const unsigned int Diagonal = Upper | Lower;
 
-enum { Aligned=0, Unaligned=1 };
+enum { Aligned=0, Unaligned=1, Unknown=2 };
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
 enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct, SparseProduct };
 
 enum {
+  /** \internal Equivalent to a slice vectorization for fixed-size matrices having good alignement
+    * and good size */
   InnerVectorization,
+  /** \internal Vectorization path using a single loop plus scalar loops for the
+    * unaligned boundaries */
   LinearVectorization,
+  /** \internal Generic vectorization path using one vectorized loop per row/column with some
+    * scalar loops to handle the unaligned boundaries */
   SliceVectorization,
   NoVectorization
 };

@@ -43,8 +43,8 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged;
 template<typename ExpressionType> class NestByValue;
 template<typename ExpressionType> class SwapWrapper;
 template<typename MatrixType> class Minor;
-template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic,
-         int DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block;
+template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic, int PacketAccess=Unaligned,
+         int _DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block;
 template<typename MatrixType> class Transpose;
 template<typename MatrixType> class Conjugate;
 template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp;
@@ -53,7 +53,7 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp;
 template<typename Lhs, typename Rhs, int ProductMode> class Product;
 template<typename CoeffsVectorType> class DiagonalMatrix;
 template<typename MatrixType> class DiagonalCoeffs;
-template<typename MatrixType, int Alignment = Unaligned> class Map;
+template<typename MatrixType, int PacketAccess = Unaligned> class Map;
 template<typename MatrixType, unsigned int Mode> class Part;
 template<typename MatrixType, unsigned int Mode> class Extract;
 template<typename ExpressionType> class Cwise;

@@ -168,12 +168,14 @@ class ei_corrected_matrix_flags
         packet_access_bit
           = ei_packet_traits<Scalar>::size > 1
             && (is_big || linear_size%ei_packet_traits<Scalar>::size==0)
-          ? PacketAccessBit : 0
+          ? PacketAccessBit : 0,
+        aligned_bit = packet_access_bit
+            && (is_big || linear_size%ei_packet_traits<Scalar>::size==0) ? AlignedBit : 0
       };
 
   public:
     enum { ret = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit | PacketAccessBit | RowMajorBit))
-               | LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit
+               | LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit | aligned_bit
     };
 };

@@ -105,8 +105,11 @@ public:
   /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
     * its four coefficients \a w, \a x, \a y and \a z.
+    *
+    * \warning Note the order of the arguments: the real \a w coefficient first,
+    * while internally the coefficients are stored in the following order:
+    * [\c x, \c y, \c z, \c w]
     */
-  // FIXME what is the prefered order: w x,y,z or x,y,z,w ?
   inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
     { m_coeffs << x, y, z, w; }
@@ -313,8 +316,8 @@ inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBas
 }
 
 /** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite
-  * rotation, it is enough to use the conjugate.
+  * Note that in most cases, i.e., if you simply want the opposite rotation,
+  * and/or the quaternion is normalized, then it is enough to use the conjugate.
   *
   * \sa Quaternion::conjugate()
   */
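
The pitfall the new \warning documents, in two lines (assuming the usual x()/w() accessors on Quaternion, which this diff does not show):

#include <Eigen/Geometry>

int main()
{
  // Arguments are (w, x, y, z)...
  Eigen::Quaternion<float> q(1.0f, 0.0f, 0.0f, 0.0f); // identity: w = 1
  // ...but storage is [x, y, z, w], so w is the last stored coefficient.
  return (q.w() == 1.0f && q.x() == 0.0f) ? 0 : 1;
}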