Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-06-04 18:54:00 +08:00
cleaned Tridiagonalization impl, with improved performance
This commit is contained in:
parent ed134a0ce5
commit c4601314be
@@ -220,100 +220,22 @@ void Tridiagonalization<MatrixType>::_compute(MatrixType& matA, CoeffVectorType&
// i.e., A = H' A H where H = I - h v v' and v = matA.col(i).end(n-i-1)

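For reference, the similarity transform in the comment above expands into the classical Householder tridiagonalization step; the intermediate vectors kept in hCoeffs throughout this hunk play the roles of p and w introduced here (a standard identity, stated up to the conjugation convention used for complex scalars, with * denoting the adjoint):

% p is the selfadjoint matrix-vector product, w the "simple axpy" correction,
% and the last equality is the rank-2 update applied to matA.
\[
  p = h\,A\,v, \qquad
  w = p - \tfrac{\bar h}{2}\,(v^{*} p)\,v, \qquad
  H^{*} A\,H = A - v\,w^{*} - w\,v^{*} .
\]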
matA.col(i).coeffRef(i+1) = 1;
Scalar* EIGEN_RESTRICT t = &hCoeffs.coeffRef(-1);

/* This is the initial algorithm, which minimizes the operation count and maximizes
 * the use of Eigen's expressions. Unfortunately, the first matrix-vector product
 * using Part<LowerTriangular|Selfadjoint> is very, very slow. */
#ifdef EIGEN_NEVER_DEFINED
// matrix - vector product
hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular|SelfAdjoint>()
                      * (h * matA.col(i).end(n-i-1))).lazy();
// simple axpy
hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))
                      * matA.col(i).end(n-i-1);
// rank-2 update
//Block<MatrixType,Dynamic,1> B(matA,i+1,i,n-i-1,1);
matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular>() -=
    (matA.col(i).end(n-i-1) * hCoeffs.end(n-i-1).adjoint()).lazy()
  + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy();
#endif
/* end initial algorithm */
// hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular|SelfAdjoint>()
//                       * matA.col(i).end(n-i-1)).lazy();
// TODO map the above code to the function call below:
ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,LowerTriangularBit>
  (n-i-1,matA.corner(BottomRight,n-i-1,n-i-1).data(), matA.stride(), matA.col(i).end(n-i-1).data(), const_cast<Scalar*>(hCoeffs.end(n-i-1).data()));

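The ei_product_selfadjoint_vector call above works on raw pointers, so its intent is easy to miss. Below is a minimal stand-alone sketch of what a lower-triangular selfadjoint matrix-vector product computes; the helper name is hypothetical and this is not Eigen's implementation. It is real-valued for brevity (the complex case must also conjugate the mirrored upper-triangle terms) and assumes the destination is overwritten.

// Computes res = A * rhs, referencing only the lower triangle of the
// column-major matrix `mat` with leading dimension `stride`.
void lower_selfadjoint_matvec(int size, const double* mat, int stride,
                              const double* rhs, double* res)
{
  for (int i = 0; i < size; ++i)
    res[i] = 0.0;                          // destination assumed overwritten
  for (int j = 0; j < size; ++j)
  {
    res[j] += mat[j + j*stride] * rhs[j];  // diagonal entry A(j,j)
    for (int i = j+1; i < size; ++i)
    {
      double a_ij = mat[i + j*stride];     // stored lower entry A(i,j)
      res[i] += a_ij * rhs[j];             // contribution of A(i,j)
      res[j] += a_ij * rhs[i];             // mirrored contribution of A(j,i) = A(i,j)
    }
  }
}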
/* If we still want to minimize the operation count (i.e., perform operations on the lower part only),
 * then we could provide the following algorithm for the selfadjoint - vector product. However, a full
 * matrix-vector product is still faster (at least for dynamic sizes that are not too small; small
 * matrices were not checked). The algorithm performs block matrix-vector and transposed matrix-vector products. */
#ifdef EIGEN_NEVER_DEFINED
int n4 = (std::max(0,n-4)/4)*4;
hCoeffs.end(n-i-1).setZero();
for (int b=i+1; b<n4; b+=4)
{
  // the ?x4 part:
  hCoeffs.end(b-4) +=
    Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4) * matA.template block<4,1>(b,i);
  // the respective transposed part:
  Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) +=
    Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4).adjoint() * Block<MatrixType,Dynamic,1>(matA,b+4,i,n-b-4,1);
  // the 4x4 block diagonal:
  Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) +=
    (Block<MatrixType,4,4>(matA,b,b,4,4).template part<LowerTriangular|SelfAdjoint>()
     * (h * Block<MatrixType,4,1>(matA,b,i,4,1))).lazy();
}
#endif
// todo: handle the remaining part
/* end optimized selfadjoint - vector product */
hCoeffs.end(n-i-1) = hCoeffs.end(n-i-1)*h
  + (h*ei_conj(h)*Scalar(-0.5)*matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1))) *
  matA.col(i).end(n-i-1);

/* Another interesting note: the above rank-2 update is much slower than the following hand-written loop.
 * After an analysis of the ASM, it seems GCC (4.2) generates poor code because of the Block. Moreover,
 * if we remove the specialization of Block for Matrix then it is even worse, much worse! */
#ifdef EIGEN_NEVER_DEFINED
// symmetric rank-2 update
for (int j1=i+1; j1<n; ++j1)
  for (int i1=j1; i1<n; ++i1)
    matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
                          + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
#endif
/* end hand written partial rank-2 update */

/* The current fastest implementation: the full matrix is used, with no "optimization" to use/compute
 * only half of the matrix. Custom vectorization of the inner col -= alpha X + beta Y such that accesses
 * to col are always aligned. Once we support that in Assign, the algorithm could be rewritten as
 * a single compact expression. This code is therefore a good benchmark for when we do that. */

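The "single compact expression" mentioned above would presumably be just the full-matrix version of the disabled rank-2 update earlier in this hunk; a hedged sketch in the same Eigen 2 style (not code from this commit):

// Hypothetical compact form, once Assign supports the aligned col -= alpha*X + beta*Y
// kernel; matA.col(i).end(n-i-1) is the Householder vector v and hCoeffs.end(n-i-1)
// holds the corrected vector w computed just below.
matA.corner(BottomRight,n-i-1,n-i-1) -=
    (matA.col(i).end(n-i-1) * hCoeffs.end(n-i-1).adjoint()).lazy()
  + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy();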
// let's use the end of hCoeffs to store temporary values:
hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1) * (h * matA.col(i).end(n-i-1))).lazy();
// FIXME in the above expr a temporary is created because of the scalar multiple by h

hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))
                      * matA.col(i).end(n-i-1);

const Scalar* EIGEN_RESTRICT pb = &matA.coeffRef(0,i);
const Scalar* EIGEN_RESTRICT pa = (&hCoeffs.coeffRef(0)) - 1;
for (int j1=i+1; j1<n; ++j1)
{
  int starti = i+1;
  int alignedEnd = starti;
  if (PacketSize>1)
  {
    int alignedStart = (starti) + ei_alignmentOffset(&matA.coeffRef(starti,j1), n-starti);
    alignedEnd = alignedStart + ((n-alignedStart)/PacketSize)*PacketSize;

    for (int i1=starti; i1<alignedStart; ++i1)
      matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
                            + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));

    Packet tmp0 = ei_pset1(hCoeffs.coeff(j1-1));
    Packet tmp1 = ei_pset1(matA.coeff(j1,i));
    Scalar* pc = &matA.coeffRef(0,j1);
    for (int i1=alignedStart ; i1<alignedEnd; i1+=PacketSize)
      ei_pstore(pc+i1,ei_psub(ei_pload(pc+i1),
                  ei_padd(ei_pmul(tmp0, ei_ploadu(pb+i1)),
                          ei_pmul(tmp1, ei_ploadu(pa+i1)))));
  }
  for (int i1=alignedEnd; i1<n; ++i1)
    matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
                          + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
}
/* end optimized implementation */
matA.col(j1).end(n-j1) -= matA.col(i).end(n-j1) * ei_conj(hCoeffs.coeff(j1-1))
                        + hCoeffs.end(n-j1) * ei_conj(matA.coeff(j1,i));

// note: at that point matA(i+1,i+1) is the (i+1)-th element of the final diagonal
// note: the sequence of the beta values leads to the subdiagonal entries
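On that last note: in the real symmetric case the reflector of step i is built so that it maps the tail of column i to a multiple of the first basis vector, which is exactly why the beta values end up on the subdiagonal (a standard property; conjugation details for complex scalars are ignored here):

% H leaves row/column i untouched and sends the tail of column i to beta*e_1,
% so after the two-sided update only the (i+1,i) entry of that tail survives.
\[
  H\,A_{\,i+1:n-1,\;i} = \beta\, e_1
  \quad\Longrightarrow\quad
  (H A H)_{\,i+1,\;i} = \beta, \qquad (H A H)_{\,i+2:n-1,\;i} = 0 .
\]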