* add bench/benchVecAdd.cpp by Gael, fix crash (ei_pload on non-aligned)
* introduce packet(int), make use of it in linear vectorized paths --> completely fixes the slowdown noticed in benchVecAdd.
* generalize coeff(int) to linear-access xprs
* clarify the access flag bits
* rework api dox in Coeffs.h and util/Constants.h
* improve certain expressions' flags, allowing more vectorization
* fix bug in Block: start(int) and end(int) returned dyn*dyn size
* fix bug in Block: just because the Eval type has packet access doesn't imply the block xpr should have it too.
parent 5b0da4b778
commit 25ba9f377c
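Context for the diff below: a hedged sketch (illustration only, not the literal Eigen code) of what replacing the (row, col) bookkeeping by a single linear index means for the vectorized assignment loop. The helper name and the packet size of 4 are assumptions:

// Hypothetical sketch: the commit replaces (row, col) bookkeeping in the
// vectorized assignment loop by one linear induction variable, plus a
// scalar loop for the unaligned remainder.
void assign_linear(float* dst, const float* src, int size)
{
    const int packetSize  = 4;                       // e.g. an SSE float packet
    const int alignedSize = (size / packetSize) * packetSize;
    for (int index = 0; index < alignedSize; index += packetSize)
        for (int k = 0; k < packetSize; ++k)         // stands in for
            dst[index + k] = src[index + k];         // writePacket(index, packet(index))
    for (int index = alignedSize; index < size; ++index)
        dst[index] = src[index];                     // scalar remainder
}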
@@ -301,45 +301,14 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = Derived1::Flags&RowMajorBit;
const int innerSize = rowMajor ? dst.cols() : dst.rows();
const int outerSize = rowMajor ? dst.rows() : dst.cols();
int index = 0;

// do the vectorizable part of the assignment
int row = 0;
int col = 0;
while (index<alignedSize)
for(int index = 0; index < alignedSize; index += packetSize)
{
int start = rowMajor ? col : row;
int end = std::min(innerSize, start + alignedSize-index);
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
dst.template writePacket<Aligned>(index, src.template packet<Aligned>(index));
}

// now we must do the rest without vectorization.
if(alignedSize == size) return;
const int k = alignedSize/innerSize;

// do the remainder of the current row or col
for(int i = alignedSize%innerSize; i < innerSize; i++)
{
const int row = rowMajor ? k : i;
const int col = rowMajor ? i : k;
dst.coeffRef(row, col) = src.coeff(row, col);
}

// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
for(int i = 0; i < innerSize; i++)
{
const int row = rowMajor ? i : j;
const int col = rowMajor ? j : i;
dst.coeffRef(row, col) = src.coeff(row, col);
}
for(int index = alignedSize; index < size; index++)
dst.coeffRef(index) = src.coeff(index);
}
};
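What exercises this path in practice: a plain dynamic-size vector assignment, as in the new benchmark. A minimal usage sketch, assuming the Eigen2-era API of this tree:

#include <Eigen/Core>
using namespace Eigen;

int main()
{
    VectorXf a(1000), b(1000);
    for (int i = 0; i < a.size(); ++i) { a[i] = 1.f; b[i] = 2.f; }
    a = a + b;   // dispatches to the LinearVectorization/NoUnrolling kernel above
    return 0;
}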
@@ -351,23 +320,9 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
const int innerSize = rowMajor ? int(Derived1::ColsAtCompileTime) : int(Derived1::RowsAtCompileTime);
const int outerSize = rowMajor ? int(Derived1::RowsAtCompileTime) : int(Derived1::ColsAtCompileTime);

// do the vectorizable part of the assignment
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);

// now we must do the rest without vectorization.
const int k = alignedSize/innerSize;
const int i = alignedSize%innerSize;

// do the remainder of the current row or col
ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, k<outerSize ? innerSize : 0>::run(dst, src, k);

// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>::run(dst, src, j);
ei_assign_novec_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
}
};

@@ -432,8 +387,8 @@ template<typename Derived, typename OtherDerived,
struct ei_assign_selector;

template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
struct ei_assign_selector<Derived,OtherDerived,false,false> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,false> {

@@ -444,8 +399,8 @@ struct ei_assign_selector<Derived,OtherDerived,false,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,false,false> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
struct ei_assign_selector<Derived,OtherDerived,true,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};

template<typename Derived>
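A hedged reading of the four ei_assign_selector specializations; the roles of the two boolean flags (evaluate into a temporary, transpose to reconcile storage orders) are inferred from the bodies, not stated in this hunk:

// Inferred mapping (flag roles are a guess, not from the source):
// <false,false>  dst.lazyAssign(other.derived())            direct
// <false,true >  dst.lazyAssign(other.transpose())          fix storage order
// <true, false>  dst.lazyAssign(other.eval())               evaluate first
// <true, true >  dst.lazyAssign(other.transpose().eval())   both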
@@ -71,13 +71,12 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
|| (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
? ~LargeBit
: ~(unsigned int)0,
MaskPacketAccessBit = ei_corrected_matrix_flags<
Scalar, RowsAtCompileTime, ColsAtCompileTime,
MaxRowsAtCompileTime, MaxColsAtCompileTime, MatrixType::Flags
>::ret & PacketAccessBit,
FlagsLinearAccessBit = MatrixType::Flags & RowMajorBit
? (RowsAtCompileTime == 1 ? LinearAccessBit : 0)
: (ColsAtCompileTime == 1 ? LinearAccessBit : 0),
RowMajor = int(MatrixType::Flags)&RowMajorBit,
InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
| FlagsLinearAccessBit,
CoeffReadCost = MatrixType::CoeffReadCost

@@ -153,6 +152,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
}

inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived()
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}

inline const Scalar _coeff(int index) const
{
return m_matrix
.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}

template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{

@@ -165,6 +179,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
m_matrix.const_cast_derived().template writePacket<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
}

template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_matrix.template packet<UnAligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}

template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<UnAligned>
(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), x);
}

protected:

const typename MatrixType::Nested m_matrix;

@@ -260,22 +289,30 @@ inline const Block<Derived> MatrixBase<Derived>
* \sa class Block, block(int,int)
*/
template<typename Derived>
inline Block<Derived> MatrixBase<Derived>::start(int size)
inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::start(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}

/** This is the const version of start(int).*/
template<typename Derived>
inline const Block<Derived> MatrixBase<Derived>::start(int size) const
inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::start(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}

/** \returns a dynamic-size expression of the last coefficients of *this.

@@ -294,26 +331,34 @@ inline const Block<Derived> MatrixBase<Derived>::start(int size) const
* \sa class Block, block(int,int)
*/
template<typename Derived>
inline Block<Derived> MatrixBase<Derived>::end(int size)
inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::end(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}

/** This is the const version of end(int).*/
template<typename Derived>
inline const Block<Derived> MatrixBase<Derived>::end(int size) const
inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::end(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
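The practical effect of the start/end bug fix is in the return type, which now fixes one dimension to 1 instead of being dyn*dyn. A small usage sketch (sum() assumed available, as in this commit's Sum.h changes):

#include <Eigen/Core>
using namespace Eigen;

float headAndTail(const VectorXf& v)
{
    // After the fix, start(int)/end(int) on a column vector return a
    // Block whose column count is fixed to 1 at compile time.
    return v.start(3).sum() + v.end(3).sum();
}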

/** \returns a fixed-size expression of the first coefficients of *this.
@@ -104,7 +104,7 @@ inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
* \link operator[](int) const \endlink, but without the assertion.
* Use this for limiting the performance cost of debugging code when doing
* repeated coefficient access. Only use this when it is guaranteed that the
* parameters \a row and \a col are in range.
* parameter \a index is in range.
*
* If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
* function equivalent to \link operator[](int) const \endlink.

@@ -115,22 +115,13 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::coeff(int index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_internal_assert(index >= 0 && index < cols());
return coeff(0, index);
}
else
{
ei_internal_assert(index >= 0 && index < rows());
return coeff(index, 0);
}
ei_internal_assert(index >= 0 && index < size());
return derived()._coeff(index);
}

/** \returns the coefficient at given index.
*
* \only_for_vectors
* This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int), operator()(int,int) const, x() const, y() const,
* z() const, w() const

@@ -139,17 +130,8 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::operator[](int index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_assert(index >= 0 && index < cols());
return coeff(0, index);
}
else
{
ei_assert(index >= 0 && index < rows());
return coeff(index, 0);
}
ei_assert(index >= 0 && index < size());
return derived()._coeff(index);
}

/** Short version: don't use this function, use

@@ -170,22 +152,13 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::coeffRef(int index)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_internal_assert(index >= 0 && index < cols());
return coeffRef(0, index);
}
else
{
ei_internal_assert(index >= 0 && index < rows());
return coeffRef(index, 0);
}
ei_internal_assert(index >= 0 && index < size());
return derived()._coeffRef(index);
}

/** \returns a reference to the coefficient at given index.
*
* \only_for_vectors
* This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int) const, operator()(int,int), x(), y(), z(), w()
*/

@@ -193,70 +166,119 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::operator[](int index)
{
ei_assert(IsVectorAtCompileTime);
if(RowsAtCompileTime == 1)
{
ei_assert(index >= 0 && index < cols());
return coeffRef(0, index);
}
else
{
ei_assert(index >= 0 && index < rows());
return coeffRef(index, 0);
}
ei_assert(index >= 0 && index < size());
return derived()._coeffRef(index);
}

/** equivalent to operator[](0). \only_for_vectors */
/** equivalent to operator[](0). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::x() const { return (*this)[0]; }

/** equivalent to operator[](1). \only_for_vectors */
/** equivalent to operator[](1). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::y() const { return (*this)[1]; }

/** equivalent to operator[](2). \only_for_vectors */
/** equivalent to operator[](2). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::z() const { return (*this)[2]; }

/** equivalent to operator[](3). \only_for_vectors */
/** equivalent to operator[](3). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::w() const { return (*this)[3]; }

/** equivalent to operator[](0). \only_for_vectors */
/** equivalent to operator[](0). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::x() { return (*this)[0]; }

/** equivalent to operator[](1). \only_for_vectors */
/** equivalent to operator[](1). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::y() { return (*this)[1]; }

/** equivalent to operator[](2). \only_for_vectors */
/** equivalent to operator[](2). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::z() { return (*this)[2]; }

/** equivalent to operator[](3). \only_for_vectors */
/** equivalent to operator[](3). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::w() { return (*this)[3]; }

/** \returns the packet of coefficients starting at the given row and column. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int row, int col) const
{ return derived().template _packet<LoadMode>(row,col); }
{
ei_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().template _packet<LoadMode>(row,col);
}

/** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int row, int col, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{ derived().template _writePacket<StoreMode>(row,col,x); }
{
ei_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
derived().template _writePacket<StoreMode>(row,col,x);
}

/** \returns the packet of coefficients starting at the given index. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int index) const
{
ei_internal_assert(index >= 0 && index < size());
return derived().template _packet<LoadMode>(index);
}

/** Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int index, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{
ei_internal_assert(index >= 0 && index < size());
derived().template _writePacket<StoreMode>(index,x);
}


#endif // EIGEN_COEFFS_H
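A hedged sketch of how a client kernel can combine the new index-based accessors; linear_sum is a hypothetical helper, and it assumes the expression has PacketAccessBit, LinearAccessBit, and size() >= packetSize:

// Hypothetical helper, not part of the commit: sums an expression by
// packets via the new packet(int) accessor, with a scalar tail via coeff(int).
template<typename Derived>
typename Derived::Scalar linear_sum(const MatrixBase<Derived>& m)
{
    typedef typename Derived::Scalar Scalar;
    const int packetSize  = ei_packet_traits<Scalar>::size;
    const int alignedSize = (m.size()/packetSize)*packetSize;
    typename ei_packet_traits<Scalar>::type acc = m.template packet<Aligned>(0);
    for (int index = packetSize; index < alignedSize; index += packetSize)
        acc = ei_padd(acc, m.template packet<Aligned>(index));
    Scalar res = ei_predux(acc);
    for (int index = alignedSize; index < m.size(); ++index)
        res += m.coeff(index);   // remainder that doesn't fill a packet
    return res;
}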
@@ -69,7 +69,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
| (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
? int(LhsFlags) & int(RhsFlags) & PacketAccessBit : 0)),
? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
};
};

@@ -108,6 +108,17 @@ class CwiseBinaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_lhs.template packet<LoadMode>(row, col), m_rhs.template packet<LoadMode>(row, col));
}

inline const Scalar _coeff(int index) const
{
return m_functor(m_lhs.coeff(index), m_rhs.coeff(index));
}

template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_functor.packetOp(m_lhs.template packet<LoadMode>(index), m_rhs.template packet<LoadMode>(index));
}

protected:
const LhsNested m_lhs;
const RhsNested m_rhs;
@@ -50,7 +50,9 @@ struct ei_traits<CwiseNullaryOp<NullaryOp, MatrixType> >
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
Flags = (MatrixType::Flags
& (HereditaryBits | LinearAccessBit | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
& ( HereditaryBits
| (ei_functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
| (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
| (ei_functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
CoeffReadCost = ei_functor_traits<NullaryOp>::Cost
};

@@ -89,6 +91,17 @@ class CwiseNullaryOp : ei_no_assignment_operator,
return m_functor.packetOp();
}

const Scalar _coeff(int index) const
{
return m_functor(index);
}

template<int LoadMode>
PacketScalar _packet(int) const
{
return m_functor.packetOp();
}

protected:
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
@@ -90,6 +90,17 @@ class CwiseUnaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_matrix.template packet<LoadMode>(row, col));
}

inline const Scalar _coeff(int index) const
{
return m_functor(m_matrix.coeff(index));
}

template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_functor.packetOp(m_matrix.template packet<LoadMode>(index));
}

protected:
const typename MatrixType::Nested m_matrix;
const UnaryOp m_functor;
@@ -56,7 +56,8 @@ struct ei_traits<DiagonalCoeffs<MatrixType> >
MaxColsAtCompileTime = 1,
Flags = (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
? (unsigned int)_MatrixTypeNested::Flags
: (unsigned int)_MatrixTypeNested::Flags &~ LargeBit) & HereditaryBits,
: (unsigned int)_MatrixTypeNested::Flags &~ LargeBit)
& (HereditaryBits | LinearAccessBit),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
};
};

@@ -87,6 +88,16 @@ template<typename MatrixType> class DiagonalCoeffs
return m_matrix.coeff(row, row);
}

inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived().coeffRef(index, index);
}

inline const Scalar _coeff(int index) const
{
return m_matrix.coeff(index, index);
}

protected:

const typename MatrixType::Nested m_matrix;
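With LinearAccessBit now kept on DiagonalCoeffs, index-based access on a diagonal expression maps straight to (i, i). A usage sketch (illustration only):

#include <Eigen/Core>
using namespace Eigen;

float traceByHand(const MatrixXf& m)
{
    // m.diagonal() is a DiagonalCoeffs expression; coeff(i) above
    // forwards to m.coeff(i, i) with no (row, col) recomputation.
    float t = 0.f;
    for (int i = 0; i < m.diagonal().size(); ++i)
        t += m.diagonal().coeff(i);
    return t;
}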
@@ -52,8 +52,7 @@ struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
&& (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),

RemovedBits = ~(((RhsFlags & RowMajorBit) && (!CanVectorizeLhs) ? 0 : RowMajorBit)
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit))
| LinearAccessBit,
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),

Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),
@@ -175,27 +175,20 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = v1.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowVector1 = Derived1::RowsAtCompileTime == 1;
const bool rowVector2 = Derived2::RowsAtCompileTime == 1;
Scalar res;

// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res;
packet_res = ei_pmul(
v1.template packet<Aligned>(0, 0),
v2.template packet<Aligned>(0, 0)
);
PacketScalar packet_res = ei_pmul(
v1.template packet<Aligned>(0),
v2.template packet<Aligned>(0)
);
for(int index = packetSize; index<alignedSize; index += packetSize)
{
const int row1 = rowVector1 ? 0 : index;
const int col1 = rowVector1 ? index : 0;
const int row2 = rowVector2 ? 0 : index;
const int col2 = rowVector2 ? index : 0;
packet_res = ei_pmadd(
v1.template packet<Aligned>(row1, col1),
v2.template packet<Aligned>(row2, col2),
v1.template packet<Aligned>(index),
v2.template packet<Aligned>(index),
packet_res
);
}

@@ -213,11 +206,7 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
// do the remainder of the vector
for(int index = alignedSize; index < size; index++)
{
const int row1 = rowVector1 ? 0 : index;
const int col1 = rowVector1 ? index : 0;
const int row2 = rowVector2 ? 0 : index;
const int col2 = rowVector2 ? index : 0;
res += v1.coeff(row1, col1) * v2.coeff(row2, col2);
res += v1.coeff(index) * v2.coeff(index);
}

return res;
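Callers are unaffected by this simplification; a minimal example of what now runs through the linear path (values filled by hand to avoid assuming any generator API):

#include <Eigen/Core>
using namespace Eigen;

float dotExample()
{
    VectorXf u(64), v(64);
    for (int i = 0; i < 64; ++i) { u[i] = 1.f; v[i] = 2.f; }
    // runs the kernel above: ei_pmul on the first packets,
    // ei_pmadd for the rest, then a scalar tail loop
    return u.dot(v);
}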
@@ -84,6 +84,16 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
return m_matrix.const_cast_derived().coeffRef(row, col);
}

inline const Scalar _coeff(int index) const
{
return m_matrix.coeff(index);
}

inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived().coeffRef(index);
}

template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{

@@ -96,6 +106,18 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}

template<int LoadMode>
inline const PacketScalar _packet(int index) const
{
return m_matrix.template packet<LoadMode>(index);
}

template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
}

protected:
ExpressionTypeNested m_matrix;
};
@@ -302,13 +302,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala

// nullary functors

template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_constant_op;
template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_constant_op;

template<typename Scalar>
struct ei_scalar_constant_op<Scalar,true> {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { }
inline const Scalar operator() (int, int) const { return ei_pfirst(m_other); }
inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); }
inline const PacketScalar packetOp() const
{ return m_other; }
const PacketScalar m_other;

@@ -316,7 +316,7 @@ struct ei_scalar_constant_op<Scalar,true> {
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,false> {
inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
inline const Scalar operator() (int, int) const { return m_other; }
inline const Scalar operator() (int, int = 0) const { return m_other; }
const Scalar m_other;
};
template<typename Scalar>

@@ -331,4 +331,11 @@ template<typename Scalar>
struct ei_functor_traits<ei_scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };

// NOTE quick hack:
// all functors allow linear access, except ei_scalar_identity_op. So we fix here a quick meta
// to indicate whether a functor allows linear access, just always answering 'yes' except for
// ei_scalar_identity_op.
template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; };
template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; };

#endif // EIGEN_FUNCTORS_H
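Why ei_scalar_identity_op is the one exception: its value depends on the full (row, col) pair, which a bare index cannot recover without knowing the matrix shape. A minimal sketch of the distinction, not Eigen code:

// Sketch: a constant-like functor can answer an index-only query,
// an identity-like functor fundamentally cannot.
struct constant_like {
    float v;
    float operator()(int) const { return v; }          // linear access is fine
};
struct identity_like {
    // needs both coordinates: 1 on the diagonal, 0 elsewhere
    float operator()(int row, int col) const { return row == col ? 1.f : 0.f; }
};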
@@ -31,14 +31,14 @@
*/
template<typename Derived>
std::ostream & operator <<
( std::ostream & s,
const MatrixBase<Derived> & m )
(std::ostream & s,
const MatrixBase<Derived> & m)
{
for( int i = 0; i < m.rows(); i++ )
for(int i = 0; i < m.rows(); i++)
{
s << m( i, 0 );
for (int j = 1; j < m.cols(); j++ )
s << " " << m( i, j );
s << m.coeff(i, 0);
for(int j = 1; j < m.cols(); j++)
s << " " << m.coeff(i, j);
if( i < m.rows() - 1)
s << "\n";
}
@@ -55,33 +55,33 @@ typename OtherDerived::Eval MatrixBase<Derived>::inverseProduct(const MatrixBase
{
// forward substitution
if(Flags & UnitDiagBit)
res.coeffRef(0,c) = other.coeff(0,c);
else
res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
for(int i=1; i<rows(); ++i)
{
Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
if (Flags & UnitDiagBit)
res.coeffRef(i,c) = tmp;
else
res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
else
{
// backward substitution
if(Flags & UnitDiagBit)
res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
else
res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
for(int i=rows()-2 ; i>=0 ; --i)
{
Scalar tmp = other.coeff(i,c)
- ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
if (Flags & UnitDiagBit)
res.coeffRef(i,c) = tmp;
else
res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
}
@@ -80,6 +80,16 @@ template<typename MatrixType> class Map
return const_cast<Scalar*>(m_data)[row + col * m_rows];
}

inline const Scalar& _coeff(int index) const
{
return m_data[index];
}

inline Scalar& _coeffRef(int index)
{
return m_data[index];
}

public:
inline Map(const Scalar* data, int rows, int cols) : m_data(data), m_rows(rows), m_cols(cols)
{
@@ -128,6 +128,11 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}

inline const Scalar& _coeff(int index) const
{
return m_storage.data()[index];
}

inline Scalar& _coeffRef(int row, int col)
{
if(Flags & RowMajorBit)

@@ -136,20 +141,33 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}

inline Scalar& _coeffRef(int index)
{
return m_storage.data()[index];
}

template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (LoadMode==Aligned)
return ei_pload(&m_storage.data()[col + row * m_storage.cols()]);
return ei_pload(m_storage.data() + col + row * m_storage.cols());
else
return ei_ploadu(&m_storage.data()[col + row * m_storage.cols()]);
return ei_ploadu(m_storage.data() + col + row * m_storage.cols());
else
if (LoadMode==Aligned)
return ei_pload(&m_storage.data()[row + col * m_storage.rows()]);
return ei_pload(m_storage.data() + row + col * m_storage.rows());
else
return ei_ploadu(&m_storage.data()[row + col * m_storage.rows()]);
return ei_ploadu(m_storage.data() + row + col * m_storage.rows());
}

template<int LoadMode>
inline PacketScalar _packet(int index) const
{
if (LoadMode==Aligned)
return ei_pload(m_storage.data() + index);
else
return ei_ploadu(m_storage.data() + index);
}

template<int StoreMode>

@@ -158,14 +176,23 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (StoreMode==Aligned)
ei_pstore(&m_storage.data()[col + row * m_storage.cols()], x);
ei_pstore(m_storage.data() + col + row * m_storage.cols(), x);
else
ei_pstoreu(&m_storage.data()[col + row * m_storage.cols()], x);
ei_pstoreu(m_storage.data() + col + row * m_storage.cols(), x);
else
if (StoreMode==Aligned)
ei_pstore(&m_storage.data()[row + col * m_storage.rows()], x);
ei_pstore(m_storage.data() + row + col * m_storage.rows(), x);
else
ei_pstoreu(&m_storage.data()[row + col * m_storage.rows()], x);
ei_pstoreu(m_storage.data() + row + col * m_storage.rows(), x);
}

template<int StoreMode>
inline void _writePacket(int index, const PacketScalar& x)
{
if (StoreMode==Aligned)
ei_pstore(m_storage.data() + index, x);
else
ei_pstoreu(m_storage.data() + index, x);
}

public:
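The index-based _packet/_writePacket mirror the (row, col) overloads: the mode template argument picks the aligned or unaligned intrinsic at compile time. A hedged sketch of the same pattern (load_packet is hypothetical, not from the commit):

// Illustration: the mode argument selects the instruction with no runtime
// branch left after instantiation; Aligned maps to ei_pload and UnAligned
// to ei_ploadu, mirroring _packet above.
template<int LoadMode, typename Scalar>
typename ei_packet_traits<Scalar>::type load_packet(const Scalar* p)
{
    if (LoadMode == Aligned)
        return ei_pload(p);    // requires p to be packet-aligned
    else
        return ei_ploadu(p);   // safe for any address, slower
}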
@@ -228,6 +228,11 @@ template<typename Derived> class MatrixBase
template<int StoreMode>
void writePacket(int row, int col, const PacketScalar& x);

template<int LoadMode>
PacketScalar packet(int index) const;
template<int StoreMode>
void writePacket(int index, const PacketScalar& x);

const Scalar x() const;
const Scalar y() const;
const Scalar z() const;

@@ -307,11 +312,11 @@ template<typename Derived> class MatrixBase
Block<Derived> block(int start, int size);
const Block<Derived> block(int start, int size) const;

Block<Derived> start(int size);
const Block<Derived> start(int size) const;
typename SubVectorReturnType<Dynamic>::Type start(int size);
const typename SubVectorReturnType<Dynamic>::Type start(int size) const;

Block<Derived> end(int size);
const Block<Derived> end(int size) const;
typename SubVectorReturnType<Dynamic>::Type end(int size);
const typename SubVectorReturnType<Dynamic>::Type end(int size) const;

Block<Derived> corner(CornerType type, int cRows, int cCols);
const Block<Derived> corner(CornerType type, int cRows, int cCols) const;
@@ -76,6 +76,16 @@ template<typename ExpressionType> class NestByValue
return m_expression.const_cast_derived().coeffRef(row, col);
}

inline const Scalar _coeff(int index) const
{
return m_expression.coeff(index);
}

inline Scalar& _coeffRef(int index)
{
return m_expression.const_cast_derived().coeffRef(index);
}

template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{

@@ -88,6 +98,18 @@ template<typename ExpressionType> class NestByValue
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}

template<int LoadMode>
inline const PacketScalar _packet(int index) const
{
return m_expression.template packet<LoadMode>(index);
}

template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
}

protected:
const ExpressionType m_expression;
};
|
||||
&& (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
|
||||
|
||||
RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
|
||||
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
|
||||
| LinearAccessBit),
|
||||
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
|
||||
|
||||
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
|
||||
| EvalBeforeAssigningBit
|
||||
@ -224,6 +223,18 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
|
||||
* which is why we don't set the LinearAccessBit.
|
||||
*/
|
||||
const Scalar _coeff(int index) const
|
||||
{
|
||||
Scalar res;
|
||||
const int row = RowsAtCompileTime == 1 ? 0 : index;
|
||||
const int col = RowsAtCompileTime == 1 ? index : 0;
|
||||
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
const PacketScalar _packet(int row, int col) const
|
||||
{
|
||||
@ -235,9 +246,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
||||
return res;
|
||||
}
|
||||
|
||||
template<typename Lhs_, typename Rhs_, int ProductMode_, typename DestDerived_, bool DirectAccess_>
|
||||
friend struct ei_cache_friendly_selector;
|
||||
|
||||
protected:
|
||||
const LhsNested m_lhs;
|
||||
const RhsNested m_rhs;
|
@@ -186,34 +186,15 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
const int size = mat.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = Derived::Flags&RowMajorBit;
const int innerSize = rowMajor ? mat.cols() : mat.rows();
const int outerSize = rowMajor ? mat.rows() : mat.cols();
Scalar res;

// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res;
packet_res = mat.template packet<Aligned>(0, 0);
int row = 0;
int col = 0;
int index = packetSize;
while (index<alignedSize)
{
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
int start = rowMajor ? col : row;
int end = std::min(innerSize, start+alignedSize-index);
if (end<start) getchar();
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
}
res = ei_predux(packet_res);
PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
for(int index = packetSize; index < alignedSize; index += packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));

// now we must do the rest without vectorization.
if(alignedSize == size) return res;
res = ei_predux(packet_res);
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.

@@ -221,25 +202,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
res = Scalar(0);
}

const int k = alignedSize/innerSize;

// do the remainder of the current row or col
for(int i = alignedSize%innerSize; i < innerSize; i++)
for(int index = alignedSize; index < size; index++)
{
const int row = rowMajor ? k : i;
const int col = rowMajor ? i : k;
res += mat.coeff(row, col);
res += mat.coeff(index);
}

// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
for(int i = 0; i < innerSize; i++)
{
const int row = rowMajor ? i : j;
const int col = rowMajor ? j : i;
res += mat.coeff(row, col);
}

return res;
}
};
@@ -56,26 +56,60 @@ const unsigned int EvalBeforeNestingBit = 0x2;
* means the expression should be evaluated before any assignement */
const unsigned int EvalBeforeAssigningBit = 0x4;

/** \ingroup flags
*
* currently unused. Means the matrix probably has a very big size.
* Could eventually be used as a hint to determine which algorithms
* to use. */
const unsigned int LargeBit = 0x8;

#ifdef EIGEN_VECTORIZE
/** \ingroup flags
*
* means the expression might be vectorized */
const unsigned int PacketAccessBit = 0x10;
* Short version: means the expression might be vectorized
*
* Long version: means that the coefficients can be handled by packets
* and start at a memory location whose alignment meets the requirements
* of the present CPU architecture for optimized packet access. In the fixed-size
* case, there is the additional condition that the total size of the coefficients
* array is a multiple of the packet size, so that it is possible to access all the
* coefficients by packets. In the dynamic-size case, there is no such condition
* on the total size, so it might not be possible to access the few last coeffs
* by packets.
*
* \note If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
* is set to the value 0.
*/
const unsigned int PacketAccessBit = 0x8;
#else
const unsigned int PacketAccessBit = 0x0;
#endif

/** \ingroup flags
*
* means the expression can be seen as 1D vector (used for explicit vectorization) */
const unsigned int LinearAccessBit = 0x20;
* Short version: means the expression can be seen as 1D vector.
*
* Long version: means that one can access the coefficients
* of this expression by coeff(int), and coeffRef(int) in the case of a lvalue expression. These
* index-based access methods are guaranteed
* to not have to do any runtime computation of a (row, col)-pair from the index, so that it
* is guaranteed that whenever it is available, index-based access is at least as fast as
* (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
*
* If both PacketAccessBit and LinearAccessBit are set, then the
* packets of this expression can be accessed by packet(int), and writePacket(int) in the case of a
* lvalue expression.
*
* Typically, all vector expressions have the LinearAccessBit, but there is one exception:
* Product expressions don't have it, because it would be troublesome for vectorization, even when the
* Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
* not index-based packet access, so they don't have the LinearAccessBit.
*/
const unsigned int LinearAccessBit = 0x10;

/** \ingroup flags
*
* Means that the underlying array of coefficients can be directly accessed. This means two things.
* First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
* expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
* array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
* This rules out expressions such as DiagonalCoeffs, whose coefficients, though referencable, do not have
* such a regular memory layout.
*/
const unsigned int DirectAccessBit = 0x20;
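A hedged sketch of how generic code is meant to query the reworked bits (can_use_linear_packets is hypothetical, not from the commit):

// Compile-time test combining the two access bits documented above:
// index-based packet access requires both of them.
template<typename Derived>
bool can_use_linear_packets()
{
    enum { f = ei_traits<Derived>::Flags };
    return (f & PacketAccessBit) && (f & LinearAccessBit);
}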

/** \ingroup flags
*

@@ -102,19 +136,19 @@ const unsigned int UpperTriangularBit = 0x200;
* means the strictly upper triangular part is 0 */
const unsigned int LowerTriangularBit = 0x400;

/** \ingroup flags
*
* means the underlying matrix data can be direclty accessed (contrary to certain
* expressions where the matrix coefficients need to be computed rather than just read from
* memory) */
const unsigned int DirectAccessBit = 0x800;

/** \ingroup flags
*
* means the object is just an array of scalars, and operations on it are regarded as operations
* on every of these scalars taken separately.
*/
const unsigned int ArrayBit = 0x1000;
const unsigned int ArrayBit = 0x800;

/** \ingroup flags
*
* currently unused. Means the matrix probably has a very big size.
* Could eventually be used as a hint to determine which algorithms
* to use. */
const unsigned int LargeBit = 0x1000;

// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit

@@ -135,7 +169,6 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
const unsigned int Diagonal = Upper | Lower;


enum { Aligned=0, UnAligned=1 };
enum { ConditionalJumpCost = 5 };
enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
@@ -155,7 +155,8 @@ class ei_corrected_matrix_flags
// so let us strictly honor the user's choice.
? SuggestedFlags&RowMajorBit
: Cols > 1 ? RowMajorBit : 0,
is_big = MaxRows == Dynamic || MaxCols == Dynamic,
inner_max_size = row_major_bit ? MaxCols : MaxRows,
is_big = inner_max_size == Dynamic,
linear_size = Cols * Rows,
packet_access_bit
= ei_packet_traits<Scalar>::size > 1
@@ -75,7 +75,9 @@

// static assertion failling if the type \a TYPE is not a vector type
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime,you_tried_calling_a_vector_method_on_a_matrix)
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
you_tried_calling_a_vector_method_on_a_matrix)

// static assertion failling if the two vector expression types are not compatible (same fixed-size or dynamic size)
#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \
bench/benchVecAdd.cpp (new file, 134 lines)
@@ -0,0 +1,134 @@
#include <Eigen/Core>
#include <bench/BenchTimer.h>
using namespace Eigen;

#ifndef SIZE
#define SIZE 50
#endif

#ifndef REPEAT
#define REPEAT 10000
#endif

typedef float Scalar;

__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);

int main(int argc, char* argv[])
{
int size = SIZE * 8;
int size2 = size * size;
Scalar* a = ei_aligned_malloc<Scalar>(size2);
Scalar* b = ei_aligned_malloc<Scalar>(size2);
Scalar* c = ei_aligned_malloc<Scalar>(size2);

for (int i=0; i<size; ++i)
{
a[i] = b[i] = c[i] = 0;
}

BenchTimer timer;

timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(a, b, c, size2);
timer.stop();
}
std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";

for (int innersize = size; innersize>2 ; --innersize)
{
if (size2%innersize==0)
{
int outersize = size2/innersize;
MatrixXf ma = MatrixXf::map(a, innersize, outersize );
MatrixXf mb = MatrixXf::map(b, innersize, outersize );
MatrixXf mc = MatrixXf::map(c, innersize, outersize );
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(ma, mb, mc);
timer.stop();
}
std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
}
}

VectorXf va = VectorXf::map(a, size2);
VectorXf vb = VectorXf::map(b, size2);
VectorXf vc = VectorXf::map(c, size2);
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(va, vb, vc);
timer.stop();
}
std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";

return 0;
}

void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
{
for (int k=0; k<REPEAT; ++k)
a = a + b;
}

void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
{
for (int k=0; k<REPEAT; ++k)
a = a + b;
}

void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
{
typedef ei_packet_traits<Scalar>::type PacketScalar;
const int PacketSize = ei_packet_traits<Scalar>::size;
PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
for (int k=0; k<REPEAT; ++k)
for (int i=0; i<size; i+=PacketSize*8)
{
a0 = ei_pload(&a[i]);
b0 = ei_pload(&b[i]);
a1 = ei_pload(&a[i+1*PacketSize]);
b1 = ei_pload(&b[i+1*PacketSize]);
a2 = ei_pload(&a[i+2*PacketSize]);
b2 = ei_pload(&b[i+2*PacketSize]);
a3 = ei_pload(&a[i+3*PacketSize]);
b3 = ei_pload(&b[i+3*PacketSize]);
ei_pstore(&a[i], ei_padd(a0, b0));
a0 = ei_pload(&a[i+4*PacketSize]);
b0 = ei_pload(&b[i+4*PacketSize]);

ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1));
a1 = ei_pload(&a[i+5*PacketSize]);
b1 = ei_pload(&b[i+5*PacketSize]);

ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2));
a2 = ei_pload(&a[i+6*PacketSize]);
b2 = ei_pload(&b[i+6*PacketSize]);

ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3));
a3 = ei_pload(&a[i+7*PacketSize]);
b3 = ei_pload(&b[i+7*PacketSize]);

ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0));
ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1));
ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2));
ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3));

// ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize])));
// ei_pstore(&a[i+3*PacketSize], ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize])));
// ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize])));
// ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize])));
// ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize])));
// ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize])));
}
}