mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-16 11:23:14 +08:00
improve performance of trsm
This commit is contained in:
parent
c6ad2deead
commit
931814d7c0
@ -75,6 +75,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
Scalar* blockW = allocatedBlockB;
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
@ -92,16 +93,19 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
|
||||
// A11 (the triangular part) and A21 the remaining rectangular part.
|
||||
// Then the high level algorithm is:
|
||||
// - B = R1 => general block copy (done during the next step)
|
||||
// - R1 = L1^-1 B => tricky part
|
||||
// - R1 = A11^-1 B => tricky part
|
||||
// - update B from the new R1 => actually this has to be performed continuously during the above step
|
||||
// - R2 = L2 * B => GEPP
|
||||
// - R2 -= A21 * B => GEPP
|
||||
|
||||
// The tricky part: compute R1 = L1^-1 B while updating B from R1
|
||||
// The idea is to split L1 into multiple small vertical panels.
|
||||
// Each panel can be split into a small triangular part A1 which is processed without optimization,
|
||||
// and the remaining small part A2 which is processed using gebp with appropriate block strides
|
||||
// The tricky part: compute R1 = A11^-1 B while updating B from R1
|
||||
// The idea is to split A11 into multiple small vertical panels.
|
||||
// Each panel can be split into a small triangular part T1k which is processed without optimization,
|
||||
// and the remaining small part T2k which is processed using gebp with appropriate block strides
|
||||
Index subcols = (kc/Traits::nr)*Traits::nr; // TODO kc might not be an ideal choice here
|
||||
for(Index j2=0; j2<cols; j2+=subcols)
|
||||
{
|
||||
// for each small vertical panels of lhs
|
||||
Index actual_cols = std::min(cols-j2,subcols);
|
||||
// for each small vertical panels [T1k^T, T2k^T]^T of lhs
|
||||
for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
|
||||
{
|
||||
Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
|
||||
@ -114,7 +118,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
|
||||
Index rs = actualPanelWidth - k - 1; // remaining size
|
||||
|
||||
Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
|
||||
for (Index j=0; j<cols; ++j)
|
||||
for (Index j=j2; j<j2+actual_cols; ++j)
|
||||
{
|
||||
if (TriStorageOrder==RowMajor)
|
||||
{
|
||||
@ -143,7 +147,7 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
|
||||
Index blockBOffset = IsLower ? k1 : lengthTarget;
|
||||
|
||||
// update the respective rows of B from other
|
||||
pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset);
|
||||
pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
|
||||
|
||||
// GEBP
|
||||
if (lengthTarget>0)
|
||||
@ -152,13 +156,13 @@ struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageO
|
||||
|
||||
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1),
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// R2 = A2 * B => GEPP
|
||||
|
||||
// R2 -= A21 * B => GEPP
|
||||
{
|
||||
Index start = IsLower ? k2+kc : 0;
|
||||
Index end = IsLower ? size : k2-kc;
|
||||
|
Loading…
x
Reference in New Issue
Block a user