diff --git a/Eigen/src/QR/Tridiagonalization.h b/Eigen/src/QR/Tridiagonalization.h index 599786302..54d705fd1 100644 --- a/Eigen/src/QR/Tridiagonalization.h +++ b/Eigen/src/QR/Tridiagonalization.h @@ -220,100 +220,22 @@ void Tridiagonalization::_compute(MatrixType& matA, CoeffVectorType& // i.e., A = H' A H where H = I - h v v' and v = matA.col(i).end(n-i-1) matA.col(i).coeffRef(i+1) = 1; + Scalar* EIGEN_RESTRICT t = &hCoeffs.coeffRef(-1); - /* This is the initial algorithm which minimize operation counts and maximize - * the use of Eigen's expression. Unfortunately, the first matrix-vector product - * using Part is very very slow */ - #ifdef EIGEN_NEVER_DEFINED - // matrix - vector product - hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part() - * (h * matA.col(i).end(n-i-1))).lazy(); - // simple axpy - hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1))) - * matA.col(i).end(n-i-1); - // rank-2 update - //Block B(matA,i+1,i,n-i-1,1); - matA.corner(BottomRight,n-i-1,n-i-1).template part() -= - (matA.col(i).end(n-i-1) * hCoeffs.end(n-i-1).adjoint()).lazy() - + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy(); - #endif - /* end initial algorithm */ + // hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part() + // * matA.col(i).end(n-i-1)).lazy(); + // TODO map the above code to the function call below: + ei_product_selfadjoint_vector + (n-i-1,matA.corner(BottomRight,n-i-1,n-i-1).data(), matA.stride(), matA.col(i).end(n-i-1).data(), const_cast(hCoeffs.end(n-i-1).data())); - /* If we still want to minimize operation count (i.e., perform operation on the lower part only) - * then we could provide the following algorithm for selfadjoint - vector product. However, a full - * matrix-vector product is still faster (at least for dynamic size, and not too small, did not check - * small matrices). The algo performs block matrix-vector and transposed matrix vector products. */ - #ifdef EIGEN_NEVER_DEFINED - int n4 = (std::max(0,n-4)/4)*4; - hCoeffs.end(n-i-1).setZero(); - for (int b=i+1; b(matA,b+4,b,n-b-4,4) * matA.template block<4,1>(b,i); - // the respective transposed part: - Block(hCoeffs, b, 0, 4,1) += - Block(matA,b+4,b,n-b-4,4).adjoint() * Block(matA,b+4,i,n-b-4,1); - // the 4x4 block diagonal: - Block(hCoeffs, b, 0, 4,1) += - (Block(matA,b,b,4,4).template part() - * (h * Block(matA,b,i,4,1))).lazy(); - } - #endif - // todo: handle the remaining part - /* end optimized selfadjoint - vector product */ + hCoeffs.end(n-i-1) = hCoeffs.end(n-i-1)*h + + (h*ei_conj(h)*Scalar(-0.5)*matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1))) * + matA.col(i).end(n-i-1); - /* Another interesting note: the above rank-2 update is much slower than the following hand written loop. - * After an analyze of the ASM, it seems GCC (4.2) generate poor code because of the Block. Moreover, - * if we remove the specialization of Block for Matrix then it is even worse, much worse ! */ - #ifdef EIGEN_NEVER_DEFINED + // symmetric rank-2 update for (int j1=i+1; j11) - { - int alignedStart = (starti) + ei_alignmentOffset(&matA.coeffRef(starti,j1), n-starti); - alignedEnd = alignedStart + ((n-alignedStart)/PacketSize)*PacketSize; - - for (int i1=starti; i1