* Fix some complex alignment issues in the cache-friendly matrix-vector products.

* Minor update of the cores of the Cholesky algorithms to make them more friendly
  wrt matrix-vector products => speedup x5!
Gael Guennebaud 2008-07-23 17:30:00 +00:00
parent 172000aaeb
commit b466c266a0
4 changed files with 93 additions and 48 deletions
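
For context, the "cores" in question are the per-column updates of the Cholesky factorizations. The speedup comes from expressing each column update as a single (size-j-1) x j matrix-vector product, so that it can go through the cache-friendly kernel fixed below. A minimal real-valued sketch of that structure in plain C++ (simple_llt is an illustrative name, not Eigen code):

#include <cmath>
#include <vector>

// In-place LLT of the col-major n x n matrix A (lower part used); returns
// false on a non-positive pivot. Element (i,j) of A is stored at A[i + j*n].
bool simple_llt(std::vector<double>& A, int n)
{
  for (int j = 0; j < n; ++j)
  {
    // pivot: A(j,j) - ||L(j, 0:j)||^2
    double x = A[j + j*n];
    for (int k = 0; k < j; ++k)
      x -= A[j + k*n] * A[j + k*n];
    if (x <= 0.0) return false;           // not positive definite
    A[j + j*n] = x = std::sqrt(x);

    // column update: a single (n-j-1) x j matrix-vector product,
    // L(j+1:n, j) -= L(j+1:n, 0:j) * L(j, 0:j)^T -- this is the hot loop
    // that the cache-friendly col-major kernel accelerates
    for (int k = 0; k < j; ++k)
      for (int i = j+1; i < n; ++i)       // contiguous walk down column k
        A[i + j*n] -= A[i + k*n] * A[j + k*n];
    for (int i = j+1; i < n; ++i)
      A[i + j*n] /= x;
  }
  return true;
}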

View File

@@ -48,24 +48,28 @@
   */
 template<typename MatrixType> class Cholesky
 {
-  public:
+  private:
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
     typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
 
+    enum {
+      PacketSize = ei_packet_traits<Scalar>::size,
+      AlignmentMask = int(PacketSize)-1
+    };
+
+  public:
+
     Cholesky(const MatrixType& matrix)
       : m_matrix(matrix.rows(), matrix.cols())
     {
       compute(matrix);
     }
 
-    Part<MatrixType, Lower> matrixL(void) const
-    {
-      return m_matrix;
-    }
+    inline Part<MatrixType, Lower> matrixL(void) const { return m_matrix; }
 
-    bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
+    /** \returns true if the matrix is positive definite */
+    inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
 
     template<typename Derived>
     typename Derived::Eval solve(const MatrixBase<Derived> &b) const;
@@ -92,20 +96,30 @@ void Cholesky<MatrixType>::compute(const MatrixType& a)
   RealScalar x;
   x = ei_real(a.coeff(0,0));
-  m_isPositiveDefinite = x > RealScalar(0) && ei_isMuchSmallerThan(ei_imag(m_matrix.coeff(0,0)), RealScalar(1));
+  m_isPositiveDefinite = x > precision<Scalar>() && ei_isMuchSmallerThan(ei_imag(m_matrix.coeff(0,0)), RealScalar(1));
   m_matrix.coeffRef(0,0) = ei_sqrt(x);
-  m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / m_matrix.coeff(0,0);
+  m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / ei_real(m_matrix.coeff(0,0));
   for (int j = 1; j < size; ++j)
   {
     Scalar tmp = ei_real(a.coeff(j,j)) - m_matrix.row(j).start(j).norm2();
     x = ei_real(tmp);
-    m_isPositiveDefinite = m_isPositiveDefinite && x > RealScalar(0) && ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1));
+    if (x < precision<Scalar>() || (!ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1))))
+    {
+      m_isPositiveDefinite = false;
+      return;
+    }
     m_matrix.coeffRef(j,j) = x = ei_sqrt(x);
     int endSize = size-j-1;
-    if (endSize>0)
-      m_matrix.col(j).end(endSize) = (a.row(j).end(endSize).adjoint()
-        - m_matrix.block(j+1, 0, endSize, j) * m_matrix.row(j).start(j).adjoint()) / x;
+    if (endSize>0) {
+      // Note that when all matrix columns have good alignment, then the following
+      // product is guaranteed to be optimal with respect to alignment.
+      m_matrix.col(j).end(endSize) =
+        (m_matrix.block(j+1, 0, endSize, j) * m_matrix.row(j).start(j).adjoint()).lazy();
+      m_matrix.col(j).end(endSize) = (a.row(j).end(endSize).adjoint()
+        - m_matrix.col(j).end(endSize) ) / x;
+    }
   }
 }
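
The interface above (factor on construction, then matrixL(), isPositiveDefinite(), solve()) is easiest to see in use. A minimal, hypothetical usage sketch, assuming the Cholesky module header and the usual dynamic-size typedefs; only the members shown in this diff are relied on:

#include <Eigen/Cholesky>
using namespace Eigen;

void solveSpd(const MatrixXd& A, const VectorXd& b)
{
  Cholesky<MatrixXd> chol(A);      // factors A = L L^* in the constructor
  if (chol.isPositiveDefinite())   // decomposition succeeded
  {
    VectorXd x = chol.solve(b);    // two triangular solves, with L then L^*
    // chol.matrixL() exposes the lower triangular factor itself
  }
}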

View File

@@ -60,23 +60,13 @@ template<typename MatrixType> class CholeskyWithoutSquareRoot
     }
 
     /** \returns the lower triangular matrix L */
-    Part<MatrixType, UnitLower> matrixL(void) const
-    {
-      return m_matrix;
-    }
+    inline Part<MatrixType, UnitLower> matrixL(void) const { return m_matrix; }
 
     /** \returns the coefficients of the diagonal matrix D */
-    DiagonalCoeffs<MatrixType> vectorD(void) const
-    {
-      return m_matrix.diagonal();
-    }
+    inline DiagonalCoeffs<MatrixType> vectorD(void) const { return m_matrix.diagonal(); }
 
-    /** \returns whether the matrix is positive definite */
-    bool isPositiveDefinite(void) const
-    {
-      // FIXME is it really correct ?
-      return m_matrix.diagonal().real().minCoeff() > RealScalar(0);
-    }
+    /** \returns true if the matrix is positive definite */
+    inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
 
     template<typename Derived>
     typename Derived::Eval solve(const MatrixBase<Derived> &b) const;
@@ -91,6 +81,8 @@ template<typename MatrixType> class CholeskyWithoutSquareRoot
       * is not stored), and the diagonal entries correspond to D.
       */
     MatrixType m_matrix;
+
+    bool m_isPositiveDefinite;
 };
 
 /** Compute / recompute the Cholesky decomposition A = L D L^* = U^* D U of \a matrix
@@ -101,6 +93,13 @@ void CholeskyWithoutSquareRoot<MatrixType>::compute(const MatrixType& a)
   assert(a.rows()==a.cols());
   const int size = a.rows();
   m_matrix.resize(size, size);
+  m_isPositiveDefinite = true;
+
+  // Let's preallocate a temporary vector so the matrix-vector product can be
+  // evaluated into it. Unlike the standard Cholesky decomposition, here we cannot
+  // evaluate it into the destination matrix, because that destination is a sub-row
+  // which is not suitable for efficient packet evaluation
+  // (at least if we assume the matrix is col-major).
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,1> _temporary(size);
 
   // Note that, in this algorithm, the rows of the strict upper part of m_matrix are
   // used to store column vectors, hence the strange .conjugate() and .transpose()...
@@ -112,11 +111,21 @@ void CholeskyWithoutSquareRoot<MatrixType>::compute(const MatrixType& a)
     RealScalar tmp = ei_real(a.coeff(j,j) - (m_matrix.row(j).start(j) * m_matrix.col(j).start(j).conjugate()).coeff(0,0));
     m_matrix.coeffRef(j,j) = tmp;
 
+    if (ei_isMuchSmallerThan(tmp,RealScalar(1)))
+    {
+      m_isPositiveDefinite = false;
+      return;
+    }
+
     int endSize = size-j-1;
     if (endSize>0)
     {
+      _temporary.end(endSize) = ( m_matrix.block(j+1,0, endSize, j)
+                                  * m_matrix.col(j).start(j).conjugate() ).lazy();
+
       m_matrix.row(j).end(endSize) = a.row(j).end(endSize).conjugate()
-        - (m_matrix.block(j+1,0, endSize, j) * m_matrix.col(j).start(j).conjugate()).transpose();
+        - _temporary.end(endSize).transpose();
 
       m_matrix.col(j).end(endSize) = m_matrix.row(j).end(endSize) / tmp;
     }
   }
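
The comment about the temporary is worth unpacking: in a column-major matrix, a sub-row has a stride of one full column between consecutive elements, so packet (SIMD) stores cannot be used on it; evaluating the product into a contiguous temporary first keeps the expensive part vectorizable. A minimal sketch of the access-pattern difference, with assumed free-standing names (not Eigen internals):

#include <vector>

// col-major n x n matrix M: element (i,j) lives at M[i + j*n]
void updateRow(std::vector<double>& M, int n, int j,
               const std::vector<double>& temporary)
{
  // Destination M(j, j+1:n) is strided: consecutive elements sit n doubles
  // apart, so this copy cannot use packet stores...
  for (int c = j+1; c < n; ++c)
    M[j + c*n] = temporary[c - (j+1)];
  // ...but the matrix-vector product that filled 'temporary' ran on
  // contiguous, alignable memory, which is where the time is spent.
}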

View File

@@ -359,6 +359,19 @@ static void ei_cache_friendly_product(
 #endif // EIGEN_EXTERN_INSTANTIATIONS
 
+template<typename Scalar>
+inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
+{
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = ei_packet_traits<Scalar>::size;
+  const int PacketAlignedMask = PacketSize-1;
+  const bool Vectorized = PacketSize>1;
+  return Vectorized
+    ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask))
+                     & PacketAlignedMask, maxOffset)
+    : 0;
+}
+
 /* Optimized col-major matrix * vector product:
  * This algorithm processes 4 columns at once, allowing us both to reduce
  * the number of loads/stores of the result by a factor of 4 and to reduce
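
To make the arithmetic in ei_alignmentOffset() concrete, here is a small self-contained example. It assumes a PacketSize of 4 floats (16-byte SSE packets); alignmentOffset is a hypothetical stand-in for the template above, and the aligned attribute matches the GCC idiom already used by the benchmark file:

#include <cstddef>
#include <cstdio>

static int alignmentOffset(const float* ptr, int maxOffset)
{
  const int PacketSize = 4;                    // e.g. 4 floats per SSE packet
  const int PacketAlignedMask = PacketSize - 1;
  int offset = (PacketSize - (int)(((std::size_t)ptr / sizeof(float)) & PacketAlignedMask))
               & PacketAlignedMask;
  return offset < maxOffset ? offset : maxOffset;
}

int main()
{
  float data[8] __attribute__ ((aligned(16)));
  for (int i = 0; i < 4; ++i)
    std::printf("offset of data+%d: %d\n", i, alignmentOffset(data + i, 8));
  // prints 0, 3, 2, 1: the number of scalars to skip before packet loads align
  return 0;
}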
@@ -397,9 +410,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   // How many coeffs of the result do we have to skip to be aligned?
   // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
-  const int alignedStart = Vectorized
-    ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
-    : 0;
+  const int alignedStart = ei_alignmentOffset(res,size);
   const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
 
   const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
@@ -408,9 +419,13 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     : alignmentStep==2 ? EvenAligned
     : FirstAligned;
 
+  // we cannot assume the first element is aligned because of sub-matrices
+  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+
   // find how many columns we have to skip to be aligned with the result (if possible)
   int skipColumns=0;
-  for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns)
+  for (; skipColumns<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipColumns; ++skipColumns)
   {}
   if (skipColumns==PacketSize)
   {
@@ -423,6 +438,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     skipColumns = std::min(skipColumns,rhs.size());
     // note that the skipped columns are processed later.
   }
+
+  ei_internal_assert((alignmentPattern==NoneAligned)
+    || size_t(lhs+alignedStart+alignmentStep*skipColumns)%sizeof(Packet)==0);
+
   int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
@@ -500,7 +519,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
         res[j] += ei_pfirst(ptmp0) * lhs0[j];
 
       // process aligned result's coeffs
-      if (((i*lhsStride) % PacketSize+alignedStart) == 0)
+      if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
         for (int j = alignedStart;j<alignedSize;j+=PacketSize)
           ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j])));
       else
@@ -557,9 +576,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   // How many coeffs of the result do we have to skip to be aligned?
   // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
-  const int alignedStart = Vectorized
-    ? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
-    : 0;
+  const int alignedStart = ei_alignmentOffset(rhs, size);
   const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
 
   //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
@@ -568,9 +585,12 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     : alignmentStep==2 ? EvenAligned
     : FirstAligned;
 
+  // we cannot assume the first element is aligned because of sub-matrices
+  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+
   // find how many rows we have to skip to be aligned with rhs (if possible)
   int skipRows=0;
-  for (; skipRows<PacketSize && alignedStart != alignmentStep*skipRows; ++skipRows)
+  for (; skipRows<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipRows; ++skipRows)
   {}
   if (skipRows==PacketSize)
   {
@@ -583,6 +603,8 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     skipRows = std::min(skipRows,res.size());
     // note that the skipped rows are processed later.
   }
+
+  ei_internal_assert((alignmentPattern==NoneAligned)
+    || size_t(lhs+alignedStart+alignmentStep*skipRows)%sizeof(Packet)==0);
 
   int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
@@ -652,7 +674,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
       if (alignedSize>alignedStart)
       {
         // process aligned rhs coeffs
-        if ((i*lhsStride+alignedStart) % PacketSize==0)
+        if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
           for (int j = alignedStart;j<alignedSize;j+=PacketSize)
             ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0);
         else
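
A note on the alignmentPattern classification these hunks feed into: with lhsStride scalars between columns, each successive column's misalignment shifts by lhsStride modulo PacketSize. A hedged sketch of that logic with free-standing names; the diff shows the literal 2, which equals PacketSize/2 for 4-float packets, and generalizing it that way is an assumption:

enum Pattern { AllAligned, EvenAligned, FirstAligned, NoneAligned };

// classify how the lhs column (or row) starts relate to packet boundaries
Pattern classify(int lhsStride, int PacketSize)
{
  int alignmentStep = lhsStride % PacketSize; // misalignment shift per column
  if (alignmentStep == 0)
    return AllAligned;        // every column starts on the same packet phase
  if (alignmentStep == PacketSize/2)
    return EvenAligned;       // alignment recurs every second column
  return FirstAligned;        // general case: handled by the skip search above
}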

View File

@@ -34,7 +34,7 @@ __attribute__ ((noinline)) void benchCholesky(const MatrixType& m)
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
 
-  MatrixType a = MatrixType::random(rows,cols);
+  MatrixType a = MatrixType::Random(rows,cols);
   SquareMatrixType covMat = a * a.adjoint();
 
   BenchTimer timerNoSqrt, timerSqrt;
@@ -108,7 +108,7 @@ __attribute__ ((noinline)) void benchCholesky(const MatrixType& m)
 int main(int argc, char* argv[])
 {
-  const int dynsizes[] = {4,6,8,12,16,24,32,64,128,256,512,0};
+  const int dynsizes[] = {/*4,6,8,12,16,24,32,49,64,67,128,129,130,131,132,*/256,257,258,259,260,512,0};
   std::cout << "size  no sqrt  standard";
 #ifdef BENCH_GSL
   std::cout << "  GSL (standard + double + ATLAS)  ";
@@ -118,15 +118,15 @@ int main(int argc, char* argv[])
   for (uint i=0; dynsizes[i]>0; ++i)
     benchCholesky(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i]));
 
-  benchCholesky(Matrix<Scalar,2,2>());
-  benchCholesky(Matrix<Scalar,3,3>());
-  benchCholesky(Matrix<Scalar,4,4>());
-  benchCholesky(Matrix<Scalar,5,5>());
-  benchCholesky(Matrix<Scalar,6,6>());
-  benchCholesky(Matrix<Scalar,7,7>());
-  benchCholesky(Matrix<Scalar,8,8>());
-  benchCholesky(Matrix<Scalar,12,12>());
-  benchCholesky(Matrix<Scalar,16,16>());
+//   benchCholesky(Matrix<Scalar,2,2>());
+//   benchCholesky(Matrix<Scalar,3,3>());
+//   benchCholesky(Matrix<Scalar,4,4>());
+//   benchCholesky(Matrix<Scalar,5,5>());
+//   benchCholesky(Matrix<Scalar,6,6>());
+//   benchCholesky(Matrix<Scalar,7,7>());
+//   benchCholesky(Matrix<Scalar,8,8>());
+//   benchCholesky(Matrix<Scalar,12,12>());
+//   benchCholesky(Matrix<Scalar,16,16>());
   return 0;
 }