Enable vectorization of product with dynamic matrices,

extended cache optimal product to work in any row/column
major situations, and a few bugfixes (forgot to add the
Cholesky header, vectorization of CwiseBinary)
This commit is contained in:
Gael Guennebaud 2008-05-01 13:53:05 +00:00
parent 6486991ac3
commit 02f1615d2a
5 changed files with 144 additions and 56 deletions

13
Eigen/Cholesky Normal file
View File

@ -0,0 +1,13 @@
#ifndef EIGEN_CHOLESKY_MODULE_H
#define EIGEN_CHOLESKY_MODULE_H
#include "Core"
namespace Eigen {
#include "Eigen/src/Cholesky/Cholesky.h"
#include "Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h"
} // namespace Eigen
#endif // EIGEN_CHOLESKY_MODULE_H

View File

@ -112,7 +112,8 @@ template<typename OtherDerived>
Derived& MatrixBase<Derived> Derived& MatrixBase<Derived>
::lazyAssign(const MatrixBase<OtherDerived>& other) ::lazyAssign(const MatrixBase<OtherDerived>& other)
{ {
// std::cout << "lazyAssign = " << Derived::Flags << " " << OtherDerived::Flags << "\n"; // std::cout << typeid(OtherDerived).name() << "\n";
// std::cout << "lazyAssign = " << (Derived::Flags&VectorizableBit) << " " << (OtherDerived::Flags&VectorizableBit) << "\n";
ei_assignment_impl<Derived, OtherDerived>::execute(derived(),other.derived()); ei_assignment_impl<Derived, OtherDerived>::execute(derived(),other.derived());
return derived(); return derived();
} }

View File

@ -64,7 +64,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
DefaultLostFlagMask DefaultLostFlagMask
| Like1DArrayBit | Like1DArrayBit
| (ei_functor_traits<BinaryOp>::IsVectorizable && ((Lhs::Flags & RowMajorBit)==(Rhs::Flags & RowMajorBit)) | (ei_functor_traits<BinaryOp>::IsVectorizable && ((Lhs::Flags & RowMajorBit)==(Rhs::Flags & RowMajorBit))
? VectorizableBit : 0))), ? Lhs::Flags & Rhs::Flags & VectorizableBit : 0))),
CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + ei_functor_traits<BinaryOp>::Cost CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + ei_functor_traits<BinaryOp>::Cost
}; };
}; };

View File

@ -78,7 +78,7 @@ template<typename ExpressionType> class Lazy
} }
protected: protected:
const typename ExpressionType::Nested m_expression; const ExpressionType& m_expression;
}; };
/** \returns an expression of the lazy version of *this. /** \returns an expression of the lazy version of *this.

View File

@ -108,7 +108,9 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
*/ */
template<typename Lhs, typename Rhs> struct ei_product_eval_mode template<typename Lhs, typename Rhs> struct ei_product_eval_mode
{ {
enum{ value = Lhs::MaxRowsAtCompileTime >= 16 && Rhs::MaxColsAtCompileTime >= 16 enum{ value = Lhs::MaxRowsAtCompileTime >= 16
&& Rhs::MaxColsAtCompileTime >= 16
&& (!( (Lhs::Flags&RowMajorBit) && (Rhs::Flags&RowMajorBit xor RowMajorBit)))
? CacheOptimalProduct : NormalProduct }; ? CacheOptimalProduct : NormalProduct };
}; };
@ -129,29 +131,18 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
ColsAtCompileTime = Rhs::ColsAtCompileTime, ColsAtCompileTime = Rhs::ColsAtCompileTime,
MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime, MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime, MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
Flags = (( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
? (unsigned int)(LhsFlags | RhsFlags) _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
: (unsigned int)(LhsFlags | RhsFlags) & ~LargeBit ) _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
_RowMajor = (RhsFlags & RowMajorBit)
&& (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
_LostBits = DefaultLostFlagMask & ~(
(_RowMajor ? 0 : RowMajorBit)
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
| EvalBeforeAssigningBit | EvalBeforeAssigningBit
| (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0)) | ((int)EvalMode == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0)
& ( | (_Vectorizable ? VectorizableBit : 0),
DefaultLostFlagMask & (~RowMajorBit)
| (
(
(!(Lhs::Flags & RowMajorBit)) && (Lhs::Flags & VectorizableBit)
&& (Lhs::RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0)
)
? VectorizableBit
: (
(
(Rhs::Flags & RowMajorBit) && (Rhs::Flags & VectorizableBit)
&& (Lhs::ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0)
)
? RowMajorBit | VectorizableBit
: 0
)
)
),
CoeffReadCost CoeffReadCost
= Lhs::ColsAtCompileTime == Dynamic = Lhs::ColsAtCompileTime == Dynamic
? Dynamic ? Dynamic
@ -278,11 +269,7 @@ Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProdu
{ {
product._cacheOptimalEval(*this, product._cacheOptimalEval(*this,
#ifdef EIGEN_VECTORIZE #ifdef EIGEN_VECTORIZE
typename ei_meta_if<(Flags & VectorizableBit) typename ei_meta_if<Flags & VectorizableBit, ei_meta_true, ei_meta_false>::ret()
&& (!(Lhs::Flags & RowMajorBit)
&& (Lhs::RowsAtCompileTime!=Dynamic)
&& (Lhs::RowsAtCompileTime%ei_packet_traits<Scalar>::size==0) ),
ei_meta_true,ei_meta_false>::ret()
#else #else
ei_meta_false() ei_meta_false()
#endif #endif
@ -296,21 +283,53 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_fals
{ {
res.setZero(); res.setZero();
const int cols4 = m_lhs.cols() & 0xfffffffC; const int cols4 = m_lhs.cols() & 0xfffffffC;
if (Lhs::Flags&RowMajorBit)
{ {
for(int k=0; k<this->cols(); ++k) // std::cout << "opt rhs\n";
int j=0;
for(; j<cols4; j+=4)
{ {
int j=0; for(int k=0; k<this->rows(); ++k)
for(; j<cols4; j+=4) {
const Scalar tmp0 = m_lhs.coeff(k,j );
const Scalar tmp1 = m_lhs.coeff(k,j+1);
const Scalar tmp2 = m_lhs.coeff(k,j+2);
const Scalar tmp3 = m_lhs.coeff(k,j+3);
for (int i=0; i<this->cols(); ++i)
res.coeffRef(k,i) += tmp0 * m_rhs.coeff(j+0,i) + tmp1 * m_rhs.coeff(j+1,i)
+ tmp2 * m_rhs.coeff(j+2,i) + tmp3 * m_rhs.coeff(j+3,i);
}
}
for(; j<m_lhs.cols(); ++j)
{
for(int k=0; k<this->rows(); ++k)
{
const Scalar tmp = m_rhs.coeff(k,j);
for (int i=0; i<this->cols(); ++i)
res.coeffRef(k,i) += tmp * m_lhs.coeff(j,i);
}
}
}
else
{
// std::cout << "opt lhs\n";
int j = 0;
for(; j<cols4; j+=4)
{
for(int k=0; k<this->cols(); ++k)
{ {
const Scalar tmp0 = m_rhs.coeff(j ,k); const Scalar tmp0 = m_rhs.coeff(j ,k);
const Scalar tmp1 = m_rhs.coeff(j+1,k); const Scalar tmp1 = m_rhs.coeff(j+1,k);
const Scalar tmp2 = m_rhs.coeff(j+2,k); const Scalar tmp2 = m_rhs.coeff(j+2,k);
const Scalar tmp3 = m_rhs.coeff(j+3,k); const Scalar tmp3 = m_rhs.coeff(j+3,k);
for (int i=0; i<this->rows(); ++i) for (int i=0; i<this->rows(); ++i)
res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1) res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j+0) + tmp1 * m_lhs.coeff(i,j+1)
+ tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3); + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
} }
for(; j<m_lhs.cols(); ++j) }
for(; j<m_lhs.cols(); ++j)
{
for(int k=0; k<this->cols(); ++k)
{ {
const Scalar tmp = m_rhs.coeff(j,k); const Scalar tmp = m_rhs.coeff(j,k);
for (int i=0; i<this->rows(); ++i) for (int i=0; i<this->rows(); ++i)
@ -325,40 +344,95 @@ template<typename Lhs, typename Rhs, int EvalMode>
template<typename DestDerived> template<typename DestDerived>
void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const
{ {
if (((Lhs::Flags&RowMajorBit) && (_cols() % ei_packet_traits<Scalar>::size != 0))
|| (_rows() % ei_packet_traits<Scalar>::size != 0))
{
return _cacheOptimalEval(res, ei_meta_false());
}
res.setZero(); res.setZero();
const int cols4 = m_lhs.cols() & 0xfffffffC; const int cols4 = m_lhs.cols() & 0xfffffffC;
for(int k=0; k<this->cols(); k++) if (Lhs::Flags&RowMajorBit)
{ {
// std::cout << "packet rhs\n";
int j=0; int j=0;
for(; j<cols4; j+=4) for(; j<cols4; j+=4)
{ {
const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); for(int k=0; k<this->rows(); k++)
const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
{ {
res.writePacketCoeff(i,k,\ const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_lhs.coeff(k,j+0));
ei_padd( const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_lhs.coeff(k,j+1));
res.packetCoeff(i,k), const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_lhs.coeff(k,j+2));
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
{
res.writePacketCoeff(k,i,
ei_padd( ei_padd(
res.packetCoeff(k,i),
ei_padd( ei_padd(
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), ei_padd(
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
ei_padd( ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), ei_padd(
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
)
) )
) )
) );
); }
} }
} }
for(; j<m_lhs.cols(); ++j) for(; j<m_lhs.cols(); ++j)
{ {
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); for(int k=0; k<this->rows(); k++)
for (int i=0; i<this->rows(); ++i) {
res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
}
}
}
else
{
// std::cout << "packet lhs\n";
int j=0;
for(; j<cols4; j+=4)
{
for(int k=0; k<this->cols(); k++)
{
const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
{
res.writePacketCoeff(i,k,
ei_padd(
res.packetCoeff(i,k),
ei_padd(
ei_padd(
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
ei_padd(
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
)
)
)
);
}
}
}
for(; j<m_lhs.cols(); ++j)
{
for(int k=0; k<this->cols(); k++)
{
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
}
} }
} }
} }