mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-10 02:39:03 +08:00
Enable vectorization of product with dynamic matrices,
extended cache optimal product to work in any row/column major situations, and a few bugfixes (forgot to add the Cholesky header, vectorization of CwiseBinary)
This commit is contained in:
parent
6486991ac3
commit
02f1615d2a
13
Eigen/Cholesky
Normal file
13
Eigen/Cholesky
Normal file
@ -0,0 +1,13 @@
|
||||
#ifndef EIGEN_CHOLESKY_MODULE_H
|
||||
#define EIGEN_CHOLESKY_MODULE_H
|
||||
|
||||
#include "Core"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
#include "Eigen/src/Cholesky/Cholesky.h"
|
||||
#include "Eigen/src/Cholesky/CholeskyWithoutSquareRoot.h"
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CHOLESKY_MODULE_H
|
@ -112,7 +112,8 @@ template<typename OtherDerived>
|
||||
Derived& MatrixBase<Derived>
|
||||
::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
// std::cout << "lazyAssign = " << Derived::Flags << " " << OtherDerived::Flags << "\n";
|
||||
// std::cout << typeid(OtherDerived).name() << "\n";
|
||||
// std::cout << "lazyAssign = " << (Derived::Flags&VectorizableBit) << " " << (OtherDerived::Flags&VectorizableBit) << "\n";
|
||||
ei_assignment_impl<Derived, OtherDerived>::execute(derived(),other.derived());
|
||||
return derived();
|
||||
}
|
||||
|
@ -64,7 +64,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
|
||||
DefaultLostFlagMask
|
||||
| Like1DArrayBit
|
||||
| (ei_functor_traits<BinaryOp>::IsVectorizable && ((Lhs::Flags & RowMajorBit)==(Rhs::Flags & RowMajorBit))
|
||||
? VectorizableBit : 0))),
|
||||
? Lhs::Flags & Rhs::Flags & VectorizableBit : 0))),
|
||||
CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + ei_functor_traits<BinaryOp>::Cost
|
||||
};
|
||||
};
|
||||
|
@ -78,7 +78,7 @@ template<typename ExpressionType> class Lazy
|
||||
}
|
||||
|
||||
protected:
|
||||
const typename ExpressionType::Nested m_expression;
|
||||
const ExpressionType& m_expression;
|
||||
};
|
||||
|
||||
/** \returns an expression of the lazy version of *this.
|
||||
|
@ -108,7 +108,9 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
|
||||
*/
|
||||
template<typename Lhs, typename Rhs> struct ei_product_eval_mode
|
||||
{
|
||||
enum{ value = Lhs::MaxRowsAtCompileTime >= 16 && Rhs::MaxColsAtCompileTime >= 16
|
||||
enum{ value = Lhs::MaxRowsAtCompileTime >= 16
|
||||
&& Rhs::MaxColsAtCompileTime >= 16
|
||||
&& (!( (Lhs::Flags&RowMajorBit) && (Rhs::Flags&RowMajorBit xor RowMajorBit)))
|
||||
? CacheOptimalProduct : NormalProduct };
|
||||
};
|
||||
|
||||
@ -129,29 +131,18 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
|
||||
ColsAtCompileTime = Rhs::ColsAtCompileTime,
|
||||
MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
|
||||
Flags = (( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
|
||||
? (unsigned int)(LhsFlags | RhsFlags)
|
||||
: (unsigned int)(LhsFlags | RhsFlags) & ~LargeBit )
|
||||
_RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
|
||||
_LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
|
||||
_Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
|
||||
_RowMajor = (RhsFlags & RowMajorBit)
|
||||
&& (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
|
||||
_LostBits = DefaultLostFlagMask & ~(
|
||||
(_RowMajor ? 0 : RowMajorBit)
|
||||
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
|
||||
Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
|
||||
| EvalBeforeAssigningBit
|
||||
| (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0))
|
||||
& (
|
||||
DefaultLostFlagMask & (~RowMajorBit)
|
||||
| (
|
||||
(
|
||||
(!(Lhs::Flags & RowMajorBit)) && (Lhs::Flags & VectorizableBit)
|
||||
&& (Lhs::RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0)
|
||||
)
|
||||
? VectorizableBit
|
||||
: (
|
||||
(
|
||||
(Rhs::Flags & RowMajorBit) && (Rhs::Flags & VectorizableBit)
|
||||
&& (Lhs::ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0)
|
||||
)
|
||||
? RowMajorBit | VectorizableBit
|
||||
: 0
|
||||
)
|
||||
)
|
||||
),
|
||||
| ((int)EvalMode == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0)
|
||||
| (_Vectorizable ? VectorizableBit : 0),
|
||||
CoeffReadCost
|
||||
= Lhs::ColsAtCompileTime == Dynamic
|
||||
? Dynamic
|
||||
@ -278,11 +269,7 @@ Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProdu
|
||||
{
|
||||
product._cacheOptimalEval(*this,
|
||||
#ifdef EIGEN_VECTORIZE
|
||||
typename ei_meta_if<(Flags & VectorizableBit)
|
||||
&& (!(Lhs::Flags & RowMajorBit)
|
||||
&& (Lhs::RowsAtCompileTime!=Dynamic)
|
||||
&& (Lhs::RowsAtCompileTime%ei_packet_traits<Scalar>::size==0) ),
|
||||
ei_meta_true,ei_meta_false>::ret()
|
||||
typename ei_meta_if<Flags & VectorizableBit, ei_meta_true, ei_meta_false>::ret()
|
||||
#else
|
||||
ei_meta_false()
|
||||
#endif
|
||||
@ -296,21 +283,53 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_fals
|
||||
{
|
||||
res.setZero();
|
||||
const int cols4 = m_lhs.cols() & 0xfffffffC;
|
||||
if (Lhs::Flags&RowMajorBit)
|
||||
{
|
||||
for(int k=0; k<this->cols(); ++k)
|
||||
// std::cout << "opt rhs\n";
|
||||
int j=0;
|
||||
for(; j<cols4; j+=4)
|
||||
{
|
||||
int j=0;
|
||||
for(; j<cols4; j+=4)
|
||||
for(int k=0; k<this->rows(); ++k)
|
||||
{
|
||||
const Scalar tmp0 = m_lhs.coeff(k,j );
|
||||
const Scalar tmp1 = m_lhs.coeff(k,j+1);
|
||||
const Scalar tmp2 = m_lhs.coeff(k,j+2);
|
||||
const Scalar tmp3 = m_lhs.coeff(k,j+3);
|
||||
for (int i=0; i<this->cols(); ++i)
|
||||
res.coeffRef(k,i) += tmp0 * m_rhs.coeff(j+0,i) + tmp1 * m_rhs.coeff(j+1,i)
|
||||
+ tmp2 * m_rhs.coeff(j+2,i) + tmp3 * m_rhs.coeff(j+3,i);
|
||||
}
|
||||
}
|
||||
for(; j<m_lhs.cols(); ++j)
|
||||
{
|
||||
for(int k=0; k<this->rows(); ++k)
|
||||
{
|
||||
const Scalar tmp = m_rhs.coeff(k,j);
|
||||
for (int i=0; i<this->cols(); ++i)
|
||||
res.coeffRef(k,i) += tmp * m_lhs.coeff(j,i);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// std::cout << "opt lhs\n";
|
||||
int j = 0;
|
||||
for(; j<cols4; j+=4)
|
||||
{
|
||||
for(int k=0; k<this->cols(); ++k)
|
||||
{
|
||||
const Scalar tmp0 = m_rhs.coeff(j ,k);
|
||||
const Scalar tmp1 = m_rhs.coeff(j+1,k);
|
||||
const Scalar tmp2 = m_rhs.coeff(j+2,k);
|
||||
const Scalar tmp3 = m_rhs.coeff(j+3,k);
|
||||
for (int i=0; i<this->rows(); ++i)
|
||||
res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1)
|
||||
+ tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
|
||||
res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j+0) + tmp1 * m_lhs.coeff(i,j+1)
|
||||
+ tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
|
||||
}
|
||||
for(; j<m_lhs.cols(); ++j)
|
||||
}
|
||||
for(; j<m_lhs.cols(); ++j)
|
||||
{
|
||||
for(int k=0; k<this->cols(); ++k)
|
||||
{
|
||||
const Scalar tmp = m_rhs.coeff(j,k);
|
||||
for (int i=0; i<this->rows(); ++i)
|
||||
@ -325,40 +344,95 @@ template<typename Lhs, typename Rhs, int EvalMode>
|
||||
template<typename DestDerived>
|
||||
void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const
|
||||
{
|
||||
|
||||
if (((Lhs::Flags&RowMajorBit) && (_cols() % ei_packet_traits<Scalar>::size != 0))
|
||||
|| (_rows() % ei_packet_traits<Scalar>::size != 0))
|
||||
{
|
||||
return _cacheOptimalEval(res, ei_meta_false());
|
||||
}
|
||||
|
||||
res.setZero();
|
||||
const int cols4 = m_lhs.cols() & 0xfffffffC;
|
||||
for(int k=0; k<this->cols(); k++)
|
||||
if (Lhs::Flags&RowMajorBit)
|
||||
{
|
||||
// std::cout << "packet rhs\n";
|
||||
int j=0;
|
||||
for(; j<cols4; j+=4)
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
|
||||
const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
|
||||
const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
|
||||
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
|
||||
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
|
||||
for(int k=0; k<this->rows(); k++)
|
||||
{
|
||||
res.writePacketCoeff(i,k,\
|
||||
ei_padd(
|
||||
res.packetCoeff(i,k),
|
||||
const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_lhs.coeff(k,j+0));
|
||||
const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_lhs.coeff(k,j+1));
|
||||
const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_lhs.coeff(k,j+2));
|
||||
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
|
||||
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
|
||||
{
|
||||
res.writePacketCoeff(k,i,
|
||||
ei_padd(
|
||||
res.packetCoeff(k,i),
|
||||
ei_padd(
|
||||
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
|
||||
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
|
||||
ei_padd(
|
||||
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
|
||||
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
|
||||
ei_padd(
|
||||
ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
|
||||
ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
|
||||
ei_padd(
|
||||
ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
|
||||
ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
for(; j<m_lhs.cols(); ++j)
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
|
||||
for (int i=0; i<this->rows(); ++i)
|
||||
res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
|
||||
for(int k=0; k<this->rows(); k++)
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
|
||||
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
|
||||
res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// std::cout << "packet lhs\n";
|
||||
int j=0;
|
||||
for(; j<cols4; j+=4)
|
||||
{
|
||||
for(int k=0; k<this->cols(); k++)
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
|
||||
const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
|
||||
const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
|
||||
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
|
||||
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
|
||||
{
|
||||
res.writePacketCoeff(i,k,
|
||||
ei_padd(
|
||||
res.packetCoeff(i,k),
|
||||
ei_padd(
|
||||
ei_padd(
|
||||
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
|
||||
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
|
||||
ei_padd(
|
||||
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
|
||||
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
for(; j<m_lhs.cols(); ++j)
|
||||
{
|
||||
for(int k=0; k<this->cols(); k++)
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
|
||||
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
|
||||
res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user