mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-20 22:52:51 +08:00
ok, now trsm works very well for upper triangular matrices
TODO: link it with the meta triangular_solve_selector and handle the case where the rhs is row major by copying it to a col-major temporary + handle right solving: X = B * M^-1
This commit is contained in:
parent
f4112dcff3
commit
282e18da49
@ -81,10 +81,8 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
|
|||||||
const Scalar* _lhs, int lhsStride,
|
const Scalar* _lhs, int lhsStride,
|
||||||
Scalar* _rhs, int rhsStride)
|
Scalar* _rhs, int rhsStride)
|
||||||
{
|
{
|
||||||
Map<Matrix<Scalar,Dynamic,Dynamic,LhsStorageOrder> > lhs(_lhs, size, size);
|
ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||||
Map<Matrix<Scalar,Dynamic,Dynamic,RhsStorageOrder> > rhs(_rhs, size, cols);
|
ei_blas_data_mapper <Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||||
//ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
|
|
||||||
//ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
|
|
||||||
|
|
||||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||||
enum {
|
enum {
|
||||||
@ -96,14 +94,16 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
|
|||||||
int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
|
int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
|
||||||
|
|
||||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||||
// Scalar* blockB = new Scalar[10*kc*cols*Blocking::PacketSize];
|
|
||||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||||
|
|
||||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,false> > gebp_kernel;
|
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,false> > gebp_kernel;
|
||||||
|
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder> pack_lhs;
|
||||||
|
|
||||||
for(int k2=0; k2<size; k2+=kc)
|
for(int k2=IsLowerTriangular ? 0 : size;
|
||||||
|
IsLowerTriangular ? k2<size : k2>0;
|
||||||
|
IsLowerTriangular ? k2+=kc : k2-=kc)
|
||||||
{
|
{
|
||||||
const int actual_kc = std::min(k2+kc,size)-k2;
|
const int actual_kc = std::min(IsLowerTriangular ? size-k2 : k2, kc);
|
||||||
|
|
||||||
// We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
|
// We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
|
||||||
// and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
|
// and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
|
||||||
@ -114,7 +114,7 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
|
|||||||
// - update B from the new R1 => actually this has to be performed continuously during the above step
|
// - update B from the new R1 => actually this has to be performed continuously during the above step
|
||||||
// - R2 = L2 * B => GEPP
|
// - R2 = L2 * B => GEPP
|
||||||
|
|
||||||
// The tricky part: R1 = L1^-1 B while updating B from R1
|
// The tricky part: compute R1 = L1^-1 B while updating B from R1
|
||||||
// The idea is to split L1 into multiple small vertical panels.
|
// The idea is to split L1 into multiple small vertical panels.
|
||||||
// Each panel can be split into a small triangular part A1 which is processed without optimization,
|
// Each panel can be split into a small triangular part A1 which is processed without optimization,
|
||||||
// and the remaining small part A2 which is processed using gebp with appropriate block strides
|
// and the remaining small part A2 which is processed using gebp with appropriate block strides
|
||||||
@ -122,76 +122,80 @@ struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
|
|||||||
// for each small vertical panels of lhs
|
// for each small vertical panels of lhs
|
||||||
for (int k1=0; k1<actual_kc; k1+=SmallPanelWidth)
|
for (int k1=0; k1<actual_kc; k1+=SmallPanelWidth)
|
||||||
{
|
{
|
||||||
int actualPanelWidth = std::min<int>(SmallPanelWidth,actual_kc-k1);
|
int actualPanelWidth = std::min<int>(actual_kc-k1, SmallPanelWidth);
|
||||||
// tr solve
|
// tr solve
|
||||||
for (int k=0; k<actualPanelWidth; ++k)
|
for (int k=0; k<actualPanelWidth; ++k)
|
||||||
{
|
{
|
||||||
int i = k2+k1+k;
|
// TODO write a small kernel handling this (can be shared with trsv)
|
||||||
|
int i = IsLowerTriangular ? k2+k1+k : k2-k1-k-1;
|
||||||
|
int s = IsLowerTriangular ? k2+k1 : i+1;
|
||||||
int rs = actualPanelWidth - k - 1; // remaining size
|
int rs = actualPanelWidth - k - 1; // remaining size
|
||||||
|
|
||||||
Scalar a = (Mode & UnitDiagBit) ? Scalar(1) : Scalar(1)/lhs(i,i);
|
Scalar a = (Mode & UnitDiagBit) ? Scalar(1) : Scalar(1)/lhs(i,i);
|
||||||
for (int j=0; j<cols; ++j)
|
for (int j=0; j<cols; ++j)
|
||||||
{
|
{
|
||||||
|
|
||||||
if (LhsStorageOrder==RowMajor)
|
if (LhsStorageOrder==RowMajor)
|
||||||
{
|
{
|
||||||
Scalar b = 0;
|
Scalar b = 0;
|
||||||
Scalar* r = &rhs.coeffRef(k2+k1,j);
|
const Scalar* l = &lhs(i,s);
|
||||||
const Scalar* l = &lhs.coeff(i,k2+k1);
|
Scalar* r = &rhs(s,j);
|
||||||
for (int i3=0; i3<k; ++i3)
|
for (int i3=0; i3<k; ++i3)
|
||||||
b += l[i3] * r[i3];
|
b += l[i3] * r[i3];
|
||||||
|
|
||||||
rhs.coeffRef(i,j) = (rhs.coeffRef(i,j) - b)*a;
|
rhs(i,j) = (rhs(i,j) - b)*a;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Scalar b = (rhs.coeffRef(i,j) *= a);
|
int s = IsLowerTriangular ? i+1 : i-rs;
|
||||||
Scalar* r = &rhs.coeffRef(i+1,j);
|
Scalar b = (rhs(i,j) *= a);
|
||||||
const Scalar* l = &lhs.coeff(i+1,i);
|
Scalar* r = &rhs(s,j);
|
||||||
|
const Scalar* l = &lhs(s,i);
|
||||||
for (int i3=0;i3<rs;++i3)
|
for (int i3=0;i3<rs;++i3)
|
||||||
r[i3] -= b * l[i3];
|
r[i3] -= b * l[i3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// for (int j=0; j<cols; ++j)
|
|
||||||
// lhs.block(k2+k1,k2+k1,actualPanelWidth,actualPanelWidth).template triangularView<LowerTriangular>()
|
int lengthTarget = actual_kc-k1-actualPanelWidth;
|
||||||
// .solveInPlace(rhs.col(j).segment(k2+k1,actualPanelWidth));
|
int startBlock = IsLowerTriangular ? k2+k1 : k2-k1-actualPanelWidth;
|
||||||
// lhs.block(k2+k1,k2+k1,actualPanelWidth,actualPanelWidth).template triangularView<LowerTriangular>()
|
int blockBOffset = IsLowerTriangular ? k1 : lengthTarget;
|
||||||
// .solveInPlace(rhs.block(k2+k1,0,actualPanelWidth,cols));
|
|
||||||
|
|
||||||
// update the respective rows of B from rhs
|
// update the respective rows of B from rhs
|
||||||
ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>()
|
ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>()
|
||||||
(blockB, _rhs+k2+k1, rhsStride, -1, actualPanelWidth, cols, actual_kc, k1);
|
(blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
|
||||||
|
|
||||||
// GEBP
|
// GEBP
|
||||||
int i = k1+actualPanelWidth;
|
if (lengthTarget>0)
|
||||||
int rs = actual_kc-i;
|
|
||||||
|
|
||||||
if (rs>0)
|
|
||||||
{
|
{
|
||||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
|
int startTarget = IsLowerTriangular ? k2+k1+actualPanelWidth : k2-actual_kc;
|
||||||
(blockA, &lhs(k2+i, k2+k1), lhsStride, actualPanelWidth, rs);
|
|
||||||
|
|
||||||
gebp_kernel(_rhs+i+k2, rhsStride,
|
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
|
||||||
blockA, blockB, rs, actualPanelWidth, cols, actualPanelWidth, actual_kc, 0, k1*Blocking::PacketSize);
|
|
||||||
|
gebp_kernel(_rhs+startTarget, rhsStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
|
||||||
|
actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// - R2 = A2 * B => GEPP
|
// R2 = A2 * B => GEPP
|
||||||
for(int i2=k2+kc; i2<size; i2+=mc)
|
|
||||||
{
|
{
|
||||||
const int actual_mc = std::min(i2+mc,size)-i2;
|
int start = IsLowerTriangular ? k2+kc : 0;
|
||||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
|
int end = IsLowerTriangular ? size : k2-kc;
|
||||||
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
for(int i2=start; i2<end; i2+=mc)
|
||||||
|
{
|
||||||
|
const int actual_mc = std::min(mc,end-i2);
|
||||||
|
if (actual_mc>0)
|
||||||
|
{
|
||||||
|
pack_lhs(blockA, &lhs(i2, IsLowerTriangular ? k2 : k2-kc), lhsStride, actual_kc, actual_mc);
|
||||||
|
|
||||||
gebp_kernel(_rhs+i2, rhsStride, blockA, blockB, actual_mc, actual_kc, cols);
|
gebp_kernel(_rhs+i2, rhsStride, blockA, blockB, actual_mc, actual_kc, cols);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||||
// delete[] blockB;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user