* add bench/benchVecAdd.cpp by Gael, fix crash (ei_pload on non-aligned)

* introduce packet(int), make use of it in linear vectorized paths
  --> completely fixes the slowdown noticed in benchVecAdd.
* generalize coeff(int) to linear-access xprs
* clarify the access flag bits
* rework api dox in Coeffs.h and util/Constants.h
* improve certain expressions' flags, allowing more vectorization
* fix bug in Block: start(int) and end(int) returned dyn*dyn size
* fix bug in Block: just because the Eval type has packet access
  doesn't imply the block xpr should have it too.
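
A minimal standalone sketch of the pattern the new linear vectorized path follows (plain SSE intrinsics on an assumed x86 target, not the actual Eigen kernel; 16-byte-aligned pointers are assumed): the bulk of the coefficients is handled one packet at a time through a single flat index, and the tail falls back to scalar index-based access.

#include <xmmintrin.h>

// Sketch: analogous to dst.writePacket<Aligned>(index, src.packet<Aligned>(index))
// for the aligned part, and dst.coeffRef(index) = src.coeff(index) for the rest.
// Both pointers are assumed 16-byte aligned.
void copy_linear(float* dst, const float* src, int size)
{
    const int packetSize  = 4;  // floats per SSE packet
    const int alignedSize = (size / packetSize) * packetSize;

    for (int index = 0; index < alignedSize; index += packetSize)
        _mm_store_ps(dst + index, _mm_load_ps(src + index));

    for (int index = alignedSize; index < size; ++index)
        dst[index] = src[index];
}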
Benoit Jacob 2008-06-26 16:06:41 +00:00
parent 5b0da4b778
commit 25ba9f377c
23 changed files with 558 additions and 264 deletions


@ -301,45 +301,14 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = Derived1::Flags&RowMajorBit;
const int innerSize = rowMajor ? dst.cols() : dst.rows();
const int outerSize = rowMajor ? dst.rows() : dst.cols();
int index = 0;
// do the vectorizable part of the assignment
int row = 0;
int col = 0;
while (index<alignedSize)
for(int index = 0; index < alignedSize; index += packetSize)
{
int start = rowMajor ? col : row;
int end = std::min(innerSize, start + alignedSize-index);
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
dst.template writePacket<Aligned>(index, src.template packet<Aligned>(index));
}
// now we must do the rest without vectorization.
if(alignedSize == size) return;
const int k = alignedSize/innerSize;
// do the remainder of the current row or col
for(int i = alignedSize%innerSize; i < innerSize; i++)
{
const int row = rowMajor ? k : i;
const int col = rowMajor ? i : k;
dst.coeffRef(row, col) = src.coeff(row, col);
}
// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
for(int i = 0; i < innerSize; i++)
{
const int row = rowMajor ? i : j;
const int col = rowMajor ? j : i;
dst.coeffRef(row, col) = src.coeff(row, col);
}
for(int index = alignedSize; index < size; index++)
dst.coeffRef(index) = src.coeff(index);
}
};
@ -351,23 +320,9 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
const int innerSize = rowMajor ? int(Derived1::ColsAtCompileTime) : int(Derived1::RowsAtCompileTime);
const int outerSize = rowMajor ? int(Derived1::RowsAtCompileTime) : int(Derived1::ColsAtCompileTime);
// do the vectorizable part of the assignment
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
// now we must do the rest without vectorization.
const int k = alignedSize/innerSize;
const int i = alignedSize%innerSize;
// do the remainder of the current row or col
ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, k<outerSize ? innerSize : 0>::run(dst, src, k);
// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>::run(dst, src, j);
ei_assign_novec_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
}
};
@ -432,8 +387,8 @@ template<typename Derived, typename OtherDerived,
struct ei_assign_selector;
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
struct ei_assign_selector<Derived,OtherDerived,false,false> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,false> {
@ -444,8 +399,8 @@ struct ei_assign_selector<Derived,OtherDerived,false,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,false,false> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
struct ei_assign_selector<Derived,OtherDerived,true,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};
template<typename Derived>


@ -71,13 +71,12 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
|| (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
? ~LargeBit
: ~(unsigned int)0,
MaskPacketAccessBit = ei_corrected_matrix_flags<
Scalar, RowsAtCompileTime, ColsAtCompileTime,
MaxRowsAtCompileTime, MaxColsAtCompileTime, MatrixType::Flags
>::ret & PacketAccessBit,
FlagsLinearAccessBit = MatrixType::Flags & RowMajorBit
? (RowsAtCompileTime == 1 ? LinearAccessBit : 0)
: (ColsAtCompileTime == 1 ? LinearAccessBit : 0),
RowMajor = int(MatrixType::Flags)&RowMajorBit,
InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
| FlagsLinearAccessBit,
CoeffReadCost = MatrixType::CoeffReadCost
@ -153,6 +152,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
}
inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived()
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
inline const Scalar _coeff(int index) const
{
return m_matrix
.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
@ -165,6 +179,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
m_matrix.const_cast_derived().template writePacket<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_matrix.template packet<UnAligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<UnAligned>
(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), x);
}
protected:
const typename MatrixType::Nested m_matrix;
@ -260,22 +289,30 @@ inline const Block<Derived> MatrixBase<Derived>
* \sa class Block, block(int,int)
*/
template<typename Derived>
inline Block<Derived> MatrixBase<Derived>::start(int size)
inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::start(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of start(int).*/
template<typename Derived>
inline const Block<Derived> MatrixBase<Derived>::start(int size) const
inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::start(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a dynamic-size expression of the last coefficients of *this.
@ -294,26 +331,34 @@ inline const Block<Derived> MatrixBase<Derived>::start(int size) const
* \sa class Block, block(int,int)
*/
template<typename Derived>
inline Block<Derived> MatrixBase<Derived>::end(int size)
inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::end(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of end(int).*/
template<typename Derived>
inline const Block<Derived> MatrixBase<Derived>::end(int size) const
inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::end(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a fixed-size expression of the first coefficients of *this.


@ -104,7 +104,7 @@ inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
* \link operator[](int) const \endlink, but without the assertion.
* Use this for limiting the performance cost of debugging code when doing
* repeated coefficient access. Only use this when it is guaranteed that the
* parameters \a row and \a col are in range.
* parameter \a index is in range.
*
* If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
* function equivalent to \link operator[](int) const \endlink.
@ -115,22 +115,13 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::coeff(int index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_internal_assert(index >= 0 && index < cols());
return coeff(0, index);
}
else
{
ei_internal_assert(index >= 0 && index < rows());
return coeff(index, 0);
}
ei_internal_assert(index >= 0 && index < size());
return derived()._coeff(index);
}
/** \returns the coefficient at given index.
*
* \only_for_vectors
* This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int), operator()(int,int) const, x() const, y() const,
* z() const, w() const
@ -139,17 +130,8 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::operator[](int index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_assert(index >= 0 && index < cols());
return coeff(0, index);
}
else
{
ei_assert(index >= 0 && index < rows());
return coeff(index, 0);
}
ei_assert(index >= 0 && index < size());
return derived()._coeff(index);
}
/** Short version: don't use this function, use
@ -170,22 +152,13 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::coeffRef(int index)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_internal_assert(index >= 0 && index < cols());
return coeffRef(0, index);
}
else
{
ei_internal_assert(index >= 0 && index < rows());
return coeffRef(index, 0);
}
ei_internal_assert(index >= 0 && index < size());
return derived()._coeffRef(index);
}
/** \returns a reference to the coefficient at given index.
*
* \only_for_vectors
* This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int) const, operator()(int,int), x(), y(), z(), w()
*/
@ -193,70 +166,119 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::operator[](int index)
{
ei_assert(IsVectorAtCompileTime);
if(RowsAtCompileTime == 1)
{
ei_assert(index >= 0 && index < cols());
return coeffRef(0, index);
}
else
{
ei_assert(index >= 0 && index < rows());
return coeffRef(index, 0);
}
ei_assert(index >= 0 && index < size());
return derived()._coeffRef(index);
}
/** equivalent to operator[](0). \only_for_vectors */
/** equivalent to operator[](0). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::x() const { return (*this)[0]; }
/** equivalent to operator[](1). \only_for_vectors */
/** equivalent to operator[](1). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::y() const { return (*this)[1]; }
/** equivalent to operator[](2). \only_for_vectors */
/** equivalent to operator[](2). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::z() const { return (*this)[2]; }
/** equivalent to operator[](3). \only_for_vectors */
/** equivalent to operator[](3). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::w() const { return (*this)[3]; }
/** equivalent to operator[](0). \only_for_vectors */
/** equivalent to operator[](0). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::x() { return (*this)[0]; }
/** equivalent to operator[](1). \only_for_vectors */
/** equivalent to operator[](1). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::y() { return (*this)[1]; }
/** equivalent to operator[](2). \only_for_vectors */
/** equivalent to operator[](2). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::z() { return (*this)[2]; }
/** equivalent to operator[](3). \only_for_vectors */
/** equivalent to operator[](3). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::w() { return (*this)[3]; }
/** \returns the packet of coefficients starting at the given row and column. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int row, int col) const
{ return derived().template _packet<LoadMode>(row,col); }
{
ei_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().template _packet<LoadMode>(row,col);
}
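// Illustration (a sketch, not Eigen code): assuming an SSE target, the
// Aligned / UnAligned LoadMode distinction boils down to the following choice
// of intrinsic. An aligned load requires a 16-byte-aligned address; the
// unaligned variant accepts any address at some speed cost.
#include <xmmintrin.h>
inline __m128 example_load_packet(const float* p, bool aligned)
{
    return aligned ? _mm_load_ps(p)   // aligned load: address must be 16-byte aligned
                   : _mm_loadu_ps(p); // unaligned load: any address, slower
}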
/** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int row, int col, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{ derived().template _writePacket<StoreMode>(row,col,x); }
{
ei_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
derived().template _writePacket<StoreMode>(row,col,x);
}
/** \returns the packet of coefficients starting at the given index. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int index) const
{
ei_internal_assert(index >= 0 && index < size());
return derived().template _packet<LoadMode>(index);
}
/** Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int index, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{
ei_internal_assert(index >= 0 && index < size());
derived().template _writePacket<StoreMode>(index,x);
}
#endif // EIGEN_COEFFS_H


@ -69,7 +69,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
| (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
? int(LhsFlags) & int(RhsFlags) & PacketAccessBit : 0)),
? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
};
};
@ -108,6 +108,17 @@ class CwiseBinaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_lhs.template packet<LoadMode>(row, col), m_rhs.template packet<LoadMode>(row, col));
}
inline const Scalar _coeff(int index) const
{
return m_functor(m_lhs.coeff(index), m_rhs.coeff(index));
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_functor.packetOp(m_lhs.template packet<LoadMode>(index), m_rhs.template packet<LoadMode>(index));
}
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;


@ -50,7 +50,9 @@ struct ei_traits<CwiseNullaryOp<NullaryOp, MatrixType> >
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
Flags = (MatrixType::Flags
& (HereditaryBits | LinearAccessBit | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
& ( HereditaryBits
| (ei_functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
| (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
| (ei_functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
CoeffReadCost = ei_functor_traits<NullaryOp>::Cost
};
@ -89,6 +91,17 @@ class CwiseNullaryOp : ei_no_assignment_operator,
return m_functor.packetOp();
}
const Scalar _coeff(int index) const
{
return m_functor(index);
}
template<int LoadMode>
PacketScalar _packet(int) const
{
return m_functor.packetOp();
}
protected:
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;


@ -90,6 +90,17 @@ class CwiseUnaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_matrix.template packet<LoadMode>(row, col));
}
inline const Scalar _coeff(int index) const
{
return m_functor(m_matrix.coeff(index));
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_functor.packetOp(m_matrix.template packet<LoadMode>(index));
}
protected:
const typename MatrixType::Nested m_matrix;
const UnaryOp m_functor;


@ -56,7 +56,8 @@ struct ei_traits<DiagonalCoeffs<MatrixType> >
MaxColsAtCompileTime = 1,
Flags = (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
? (unsigned int)_MatrixTypeNested::Flags
: (unsigned int)_MatrixTypeNested::Flags &~ LargeBit) & HereditaryBits,
: (unsigned int)_MatrixTypeNested::Flags &~ LargeBit)
& (HereditaryBits | LinearAccessBit),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
};
};
@ -87,6 +88,16 @@ template<typename MatrixType> class DiagonalCoeffs
return m_matrix.coeff(row, row);
}
inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived().coeffRef(index, index);
}
inline const Scalar _coeff(int index) const
{
return m_matrix.coeff(index, index);
}
protected:
const typename MatrixType::Nested m_matrix;


@ -52,8 +52,7 @@ struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
&& (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
RemovedBits = ~(((RhsFlags & RowMajorBit) && (!CanVectorizeLhs) ? 0 : RowMajorBit)
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit))
| LinearAccessBit,
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),


@ -175,27 +175,20 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = v1.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowVector1 = Derived1::RowsAtCompileTime == 1;
const bool rowVector2 = Derived2::RowsAtCompileTime == 1;
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res;
packet_res = ei_pmul(
v1.template packet<Aligned>(0, 0),
v2.template packet<Aligned>(0, 0)
);
PacketScalar packet_res = ei_pmul(
v1.template packet<Aligned>(0),
v2.template packet<Aligned>(0)
);
for(int index = packetSize; index<alignedSize; index += packetSize)
{
const int row1 = rowVector1 ? 0 : index;
const int col1 = rowVector1 ? index : 0;
const int row2 = rowVector2 ? 0 : index;
const int col2 = rowVector2 ? index : 0;
packet_res = ei_pmadd(
v1.template packet<Aligned>(row1, col1),
v2.template packet<Aligned>(row2, col2),
v1.template packet<Aligned>(index),
v2.template packet<Aligned>(index),
packet_res
);
}
@ -213,11 +206,7 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
// do the remainder of the vector
for(int index = alignedSize; index < size; index++)
{
const int row1 = rowVector1 ? 0 : index;
const int col1 = rowVector1 ? index : 0;
const int row2 = rowVector2 ? 0 : index;
const int col2 = rowVector2 ? index : 0;
res += v1.coeff(row1, col1) * v2.coeff(row2, col2);
res += v1.coeff(index) * v2.coeff(index);
}
return res;


@ -84,6 +84,16 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
return m_matrix.const_cast_derived().coeffRef(row, col);
}
inline const Scalar _coeff(int index) const
{
return m_matrix.coeff(index);
}
inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived().coeffRef(index);
}
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@ -96,6 +106,18 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
template<int LoadMode>
inline const PacketScalar _packet(int index) const
{
return m_matrix.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
}
protected:
ExpressionTypeNested m_matrix;
};


@ -302,13 +302,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala
// nullary functors
template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_constant_op;
template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_constant_op;
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,true> {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { }
inline const Scalar operator() (int, int) const { return ei_pfirst(m_other); }
inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); }
inline const PacketScalar packetOp() const
{ return m_other; }
const PacketScalar m_other;
@ -316,7 +316,7 @@ struct ei_scalar_constant_op<Scalar,true> {
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,false> {
inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
inline const Scalar operator() (int, int) const { return m_other; }
inline const Scalar operator() (int, int = 0) const { return m_other; }
const Scalar m_other;
};
template<typename Scalar>
@ -331,4 +331,11 @@ template<typename Scalar>
struct ei_functor_traits<ei_scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
// NOTE quick hack:
// all functors allow linear access, except ei_scalar_identity_op. So we define here a quick meta-helper
// indicating whether a functor allows linear access, always answering 'yes' except for
// ei_scalar_identity_op.
template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; };
template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; };
#endif // EIGEN_FUNCTORS_H


@ -31,14 +31,14 @@
*/
template<typename Derived>
std::ostream & operator <<
( std::ostream & s,
const MatrixBase<Derived> & m )
(std::ostream & s,
const MatrixBase<Derived> & m)
{
for( int i = 0; i < m.rows(); i++ )
for(int i = 0; i < m.rows(); i++)
{
s << m( i, 0 );
for (int j = 1; j < m.cols(); j++ )
s << " " << m( i, j );
s << m.coeff(i, 0);
for(int j = 1; j < m.cols(); j++)
s << " " << m.coeff(i, j);
if( i < m.rows() - 1)
s << "\n";
}


@ -55,33 +55,33 @@ typename OtherDerived::Eval MatrixBase<Derived>::inverseProduct(const MatrixBase
{
// forward substitution
if(Flags & UnitDiagBit)
res.coeffRef(0,c) = other.coeff(0,c);
else
res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
for(int i=1; i<rows(); ++i)
{
Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
if (Flags & UnitDiagBit)
res.coeffRef(i,c) = tmp;
else
res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
else
{
// backward substitution
if(Flags & UnitDiagBit)
res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
else
res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
for(int i=rows()-2 ; i>=0 ; --i)
{
Scalar tmp = other.coeff(i,c)
- ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
if (Flags & UnitDiagBit)
res.coeffRef(i,c) = tmp;
else
res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
}


@ -80,6 +80,16 @@ template<typename MatrixType> class Map
return const_cast<Scalar*>(m_data)[row + col * m_rows];
}
inline const Scalar& _coeff(int index) const
{
return m_data[index];
}
inline Scalar& _coeffRef(int index)
{
return m_data[index];
}
public:
inline Map(const Scalar* data, int rows, int cols) : m_data(data), m_rows(rows), m_cols(cols)
{


@ -128,6 +128,11 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
inline const Scalar& _coeff(int index) const
{
return m_storage.data()[index];
}
inline Scalar& _coeffRef(int row, int col)
{
if(Flags & RowMajorBit)
@ -136,20 +141,33 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
inline Scalar& _coeffRef(int index)
{
return m_storage.data()[index];
}
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (LoadMode==Aligned)
return ei_pload(&m_storage.data()[col + row * m_storage.cols()]);
return ei_pload(m_storage.data() + col + row * m_storage.cols());
else
return ei_ploadu(&m_storage.data()[col + row * m_storage.cols()]);
return ei_ploadu(m_storage.data() + col + row * m_storage.cols());
else
if (LoadMode==Aligned)
return ei_pload(&m_storage.data()[row + col * m_storage.rows()]);
return ei_pload(m_storage.data() + row + col * m_storage.rows());
else
return ei_ploadu(&m_storage.data()[row + col * m_storage.rows()]);
return ei_ploadu(m_storage.data() + row + col * m_storage.rows());
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
if (LoadMode==Aligned)
return ei_pload(m_storage.data() + index);
else
return ei_ploadu(m_storage.data() + index);
}
template<int StoreMode>
@ -158,14 +176,23 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (StoreMode==Aligned)
ei_pstore(&m_storage.data()[col + row * m_storage.cols()], x);
ei_pstore(m_storage.data() + col + row * m_storage.cols(), x);
else
ei_pstoreu(&m_storage.data()[col + row * m_storage.cols()], x);
ei_pstoreu(m_storage.data() + col + row * m_storage.cols(), x);
else
if (StoreMode==Aligned)
ei_pstore(&m_storage.data()[row + col * m_storage.rows()], x);
ei_pstore(m_storage.data() + row + col * m_storage.rows(), x);
else
ei_pstoreu(&m_storage.data()[row + col * m_storage.rows()], x);
ei_pstoreu(m_storage.data() + row + col * m_storage.rows(), x);
}
template<int StoreMode>
inline void _writePacket(int index, const PacketScalar& x)
{
if (StoreMode==Aligned)
ei_pstore(m_storage.data() + index, x);
else
ei_pstoreu(m_storage.data() + index, x);
}
public:


@ -228,6 +228,11 @@ template<typename Derived> class MatrixBase
template<int StoreMode>
void writePacket(int row, int col, const PacketScalar& x);
template<int LoadMode>
PacketScalar packet(int index) const;
template<int StoreMode>
void writePacket(int index, const PacketScalar& x);
const Scalar x() const;
const Scalar y() const;
const Scalar z() const;
@ -307,11 +312,11 @@ template<typename Derived> class MatrixBase
Block<Derived> block(int start, int size);
const Block<Derived> block(int start, int size) const;
Block<Derived> start(int size);
const Block<Derived> start(int size) const;
typename SubVectorReturnType<Dynamic>::Type start(int size);
const typename SubVectorReturnType<Dynamic>::Type start(int size) const;
Block<Derived> end(int size);
const Block<Derived> end(int size) const;
typename SubVectorReturnType<Dynamic>::Type end(int size);
const typename SubVectorReturnType<Dynamic>::Type end(int size) const;
Block<Derived> corner(CornerType type, int cRows, int cCols);
const Block<Derived> corner(CornerType type, int cRows, int cCols) const;


@ -76,6 +76,16 @@ template<typename ExpressionType> class NestByValue
return m_expression.const_cast_derived().coeffRef(row, col);
}
inline const Scalar _coeff(int index) const
{
return m_expression.coeff(index);
}
inline Scalar& _coeffRef(int index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@ -88,6 +98,18 @@ template<typename ExpressionType> class NestByValue
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
template<int LoadMode>
inline const PacketScalar _packet(int index) const
{
return m_expression.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
}
protected:
const ExpressionType m_expression;
};


@ -151,8 +151,7 @@ struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
&& (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
| LinearAccessBit),
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| EvalBeforeAssigningBit
@ -224,6 +223,18 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
/* Allow index-based non-packet access. It is impossible though to allow index-based packet access,
* which is why we don't set the LinearAccessBit.
*/
const Scalar _coeff(int index) const
{
Scalar res;
const int row = RowsAtCompileTime == 1 ? 0 : index;
const int col = RowsAtCompileTime == 1 ? index : 0;
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
return res;
}
template<int LoadMode>
const PacketScalar _packet(int row, int col) const
{
@ -235,9 +246,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
template<typename Lhs_, typename Rhs_, int ProductMode_, typename DestDerived_, bool DirectAccess_>
friend struct ei_cache_friendly_selector;
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;


@ -186,34 +186,15 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
const int size = mat.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = Derived::Flags&RowMajorBit;
const int innerSize = rowMajor ? mat.cols() : mat.rows();
const int outerSize = rowMajor ? mat.rows() : mat.cols();
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res;
packet_res = mat.template packet<Aligned>(0, 0);
int row = 0;
int col = 0;
int index = packetSize;
while (index<alignedSize)
{
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
int start = rowMajor ? col : row;
int end = std::min(innerSize, start+alignedSize-index);
if (end<start) getchar();
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
}
res = ei_predux(packet_res);
PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
for(int index = packetSize; index < alignedSize; index += packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
// now we must do the rest without vectorization.
if(alignedSize == size) return res;
res = ei_predux(packet_res);
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
@ -221,25 +202,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
res = Scalar(0);
}
const int k = alignedSize/innerSize;
// do the remainder of the current row or col
for(int i = alignedSize%innerSize; i < innerSize; i++)
for(int index = alignedSize; index < size; index++)
{
const int row = rowMajor ? k : i;
const int col = rowMajor ? i : k;
res += mat.coeff(row, col);
res += mat.coeff(index);
}
// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
for(int i = 0; i < innerSize; i++)
{
const int row = rowMajor ? i : j;
const int col = rowMajor ? j : i;
res += mat.coeff(row, col);
}
return res;
}
};


@ -56,26 +56,60 @@ const unsigned int EvalBeforeNestingBit = 0x2;
* means the expression should be evaluated before any assignment */
const unsigned int EvalBeforeAssigningBit = 0x4;
/** \ingroup flags
*
* currently unused. Means the matrix probably has a very big size.
* Could eventually be used as a hint to determine which algorithms
* to use. */
const unsigned int LargeBit = 0x8;
#ifdef EIGEN_VECTORIZE
/** \ingroup flags
*
* means the expression might be vectorized */
const unsigned int PacketAccessBit = 0x10;
* Short version: means the expression might be vectorized
*
* Long version: means that the coefficients can be handled by packets
* and start at a memory location whose alignment meets the requirements
* of the present CPU architecture for optimized packet access. In the fixed-size
* case, there is the additional condition that the total size of the coefficients
* array is a multiple of the packet size, so that it is possible to access all the
* coefficients by packets. In the dynamic-size case, there is no such condition
* on the total size, so it might not be possible to access the last few coefficients
* by packets.
*
* \note If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
* is set to the value 0.
*/
const unsigned int PacketAccessBit = 0x8;
#else
const unsigned int PacketAccessBit = 0x0;
#endif
/** \ingroup flags
*
* means the expression can be seen as 1D vector (used for explicit vectorization) */
const unsigned int LinearAccessBit = 0x20;
* Short version: means the expression can be seen as a 1D vector.
*
* Long version: means that one can access the coefficients
* of this expression by coeff(int), and by coeffRef(int) in the case of an lvalue expression. These
* index-based access methods are guaranteed not to perform any runtime computation of a
* (row, col) pair from the index, so whenever index-based access is available it is at least
* as fast as (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
*
* If both PacketAccessBit and LinearAccessBit are set, then the
* packets of this expression can be accessed by packet(int), and by writePacket(int) in the case of an
* lvalue expression.
*
* Typically, all vector expressions have the LinearAccessBit, but there is one exception:
* Product expressions don't have it, because it would be troublesome for vectorization, even when the
* Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
* not index-based packet access, so they don't have the LinearAccessBit.
*/
const unsigned int LinearAccessBit = 0x10;
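// A small sketch (hypothetical helper, not part of Eigen): index-based packet
// access via packet(int) / writePacket(int) is possible only when an
// expression's Flags word has both of the bits defined above.
inline bool example_can_use_index_based_packets(unsigned int flags)
{
    return (flags & PacketAccessBit) && (flags & LinearAccessBit);
}
// e.g. a plain dense vector sets both bits, while a vector-shaped Product
// sets at most PacketAccessBit (see the note above), so it does not qualify.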
/** \ingroup flags
*
* Means that the underlying array of coefficients can be directly accessed. This means two things.
* First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
* expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
* array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
* This rules out expressions such as DiagonalCoeffs, whose coefficients, though referenceable, do not have
* such a regular memory layout.
*/
const unsigned int DirectAccessBit = 0x20;
/** \ingroup flags
*
@ -102,19 +136,19 @@ const unsigned int UpperTriangularBit = 0x200;
* means the strictly upper triangular part is 0 */
const unsigned int LowerTriangularBit = 0x400;
/** \ingroup flags
*
* means the underlying matrix data can be directly accessed (contrary to certain
* expressions where the matrix coefficients need to be computed rather than just read from
* memory) */
const unsigned int DirectAccessBit = 0x800;
/** \ingroup flags
*
* means the object is just an array of scalars, and operations on it are regarded as operations
* on every of these scalars taken separately.
*/
const unsigned int ArrayBit = 0x1000;
const unsigned int ArrayBit = 0x800;
/** \ingroup flags
*
* currently unused. Means the matrix probably has a very big size.
* Could eventually be used as a hint to determine which algorithms
* to use. */
const unsigned int LargeBit = 0x1000;
// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit
@ -135,7 +169,6 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
const unsigned int Diagonal = Upper | Lower;
enum { Aligned=0, UnAligned=1 };
enum { ConditionalJumpCost = 5 };
enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };


@ -155,7 +155,8 @@ class ei_corrected_matrix_flags
// so let us strictly honor the user's choice.
? SuggestedFlags&RowMajorBit
: Cols > 1 ? RowMajorBit : 0,
is_big = MaxRows == Dynamic || MaxCols == Dynamic,
inner_max_size = row_major_bit ? MaxCols : MaxRows,
is_big = inner_max_size == Dynamic,
linear_size = Cols * Rows,
packet_access_bit
= ei_packet_traits<Scalar>::size > 1


@ -75,7 +75,9 @@
// static assertion failing if the type \a TYPE is not a vector type
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime,you_tried_calling_a_vector_method_on_a_matrix)
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
you_tried_calling_a_vector_method_on_a_matrix)
// static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \

bench/benchVecAdd.cpp (new file, 134 lines)

@ -0,0 +1,134 @@
#include <Eigen/Core>
#include <bench/BenchTimer.h>
using namespace Eigen;
#ifndef SIZE
#define SIZE 50
#endif
#ifndef REPEAT
#define REPEAT 10000
#endif
typedef float Scalar;
__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
int main(int argc, char* argv[])
{
int size = SIZE * 8;
int size2 = size * size;
Scalar* a = ei_aligned_malloc<Scalar>(size2);
Scalar* b = ei_aligned_malloc<Scalar>(size2);
Scalar* c = ei_aligned_malloc<Scalar>(size2);
for (int i=0; i<size2; ++i)
{
a[i] = b[i] = c[i] = 0;
}
BenchTimer timer;
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(a, b, c, size2);
timer.stop();
}
std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
for (int innersize = size; innersize>2 ; --innersize)
{
if (size2%innersize==0)
{
int outersize = size2/innersize;
MatrixXf ma = MatrixXf::map(a, innersize, outersize );
MatrixXf mb = MatrixXf::map(b, innersize, outersize );
MatrixXf mc = MatrixXf::map(c, innersize, outersize );
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(ma, mb, mc);
timer.stop();
}
std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
}
}
VectorXf va = VectorXf::map(a, size2);
VectorXf vb = VectorXf::map(b, size2);
VectorXf vc = VectorXf::map(c, size2);
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(va, vb, vc);
timer.stop();
}
std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
return 0;
}
void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
{
for (int k=0; k<REPEAT; ++k)
a = a + b;
}
void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
{
for (int k=0; k<REPEAT; ++k)
a = a + b;
}
void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
{
typedef ei_packet_traits<Scalar>::type PacketScalar;
const int PacketSize = ei_packet_traits<Scalar>::size;
PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
for (int k=0; k<REPEAT; ++k)
for (int i=0; i<size; i+=PacketSize*8)
{
a0 = ei_pload(&a[i]);
b0 = ei_pload(&b[i]);
a1 = ei_pload(&a[i+1*PacketSize]);
b1 = ei_pload(&b[i+1*PacketSize]);
a2 = ei_pload(&a[i+2*PacketSize]);
b2 = ei_pload(&b[i+2*PacketSize]);
a3 = ei_pload(&a[i+3*PacketSize]);
b3 = ei_pload(&b[i+3*PacketSize]);
ei_pstore(&a[i], ei_padd(a0, b0));
a0 = ei_pload(&a[i+4*PacketSize]);
b0 = ei_pload(&b[i+4*PacketSize]);
ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1));
a1 = ei_pload(&a[i+5*PacketSize]);
b1 = ei_pload(&b[i+5*PacketSize]);
ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2));
a2 = ei_pload(&a[i+6*PacketSize]);
b2 = ei_pload(&b[i+6*PacketSize]);
ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3));
a3 = ei_pload(&a[i+7*PacketSize]);
b3 = ei_pload(&b[i+7*PacketSize]);
ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0));
ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1));
ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2));
ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3));
// ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize])));
// ei_pstore(&a[i+3*PacketSize], ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize])));
// ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize])));
// ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize])));
// ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize])));
// ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize])));
}
}