mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
trmm is now working in all storage order configurations
This commit is contained in:
parent
1d4d9a37fd
commit
6aba84719d
@ -189,6 +189,7 @@ namespace Eigen {
|
|||||||
#include "src/Core/products/SelfadjointRank2Update.h"
|
#include "src/Core/products/SelfadjointRank2Update.h"
|
||||||
#include "src/Core/products/TriangularMatrixVector.h"
|
#include "src/Core/products/TriangularMatrixVector.h"
|
||||||
#include "src/Core/products/TriangularSolverMatrix.h"
|
#include "src/Core/products/TriangularSolverMatrix.h"
|
||||||
|
#include "src/Core/products/TriangularMatrixMatrix.h"
|
||||||
#include "src/Core/BandMatrix.h"
|
#include "src/Core/BandMatrix.h"
|
||||||
|
|
||||||
} // namespace Eigen
|
} // namespace Eigen
|
||||||
|
@ -125,7 +125,7 @@ struct ei_gebp_kernel
|
|||||||
// loops on each register blocking of lhs/res
|
// loops on each register blocking of lhs/res
|
||||||
for(int i=0; i<peeled_mc; i+=mr)
|
for(int i=0; i<peeled_mc; i+=mr)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA];
|
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
#ifdef EIGEN_VECTORIZE_SSE
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||||
#endif
|
#endif
|
||||||
@ -248,7 +248,7 @@ struct ei_gebp_kernel
|
|||||||
}
|
}
|
||||||
for(int i=peeled_mc; i<rows; i++)
|
for(int i=peeled_mc; i<rows; i++)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA];
|
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
#ifdef EIGEN_VECTORIZE_SSE
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||||
#endif
|
#endif
|
||||||
@ -285,7 +285,7 @@ struct ei_gebp_kernel
|
|||||||
{
|
{
|
||||||
for(int i=0; i<peeled_mc; i+=mr)
|
for(int i=0; i<peeled_mc; i+=mr)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA];
|
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
#ifdef EIGEN_VECTORIZE_SSE
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||||
#endif
|
#endif
|
||||||
@ -317,7 +317,7 @@ struct ei_gebp_kernel
|
|||||||
}
|
}
|
||||||
for(int i=peeled_mc; i<rows; i++)
|
for(int i=peeled_mc; i<rows; i++)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA];
|
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
#ifdef EIGEN_VECTORIZE_SSE
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||||
#endif
|
#endif
|
||||||
@ -375,17 +375,21 @@ struct ei_gemm_pack_lhs
|
|||||||
// 4 5 6 7 16 17 18 19 25 28
|
// 4 5 6 7 16 17 18 19 25 28
|
||||||
// 8 9 10 11 20 21 22 23 26 29
|
// 8 9 10 11 20 21 22 23 26 29
|
||||||
// . . . . . . . . . .
|
// . . . . . . . . . .
|
||||||
template<typename Scalar, int nr>
|
template<typename Scalar, int nr, bool PanelMode>
|
||||||
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
|
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
|
||||||
{
|
{
|
||||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||||
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
|
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
|
||||||
|
int stride=0, int offset=0)
|
||||||
{
|
{
|
||||||
|
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
||||||
bool hasAlpha = alpha != Scalar(1);
|
bool hasAlpha = alpha != Scalar(1);
|
||||||
int packet_cols = (cols/nr) * nr;
|
int packet_cols = (cols/nr) * nr;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||||
{
|
{
|
||||||
|
// skip what we have before
|
||||||
|
if(PanelMode) count += PacketSize * nr * offset;
|
||||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
||||||
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
|
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
|
||||||
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
|
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
|
||||||
@ -418,10 +422,13 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
|
|||||||
count += nr*PacketSize;
|
count += nr*PacketSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// skip what we have after
|
||||||
|
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
|
||||||
}
|
}
|
||||||
// copy the remaining columns one at a time (nr==1)
|
// copy the remaining columns one at a time (nr==1)
|
||||||
for(int j2=packet_cols; j2<cols; ++j2)
|
for(int j2=packet_cols; j2<cols; ++j2)
|
||||||
{
|
{
|
||||||
|
if(PanelMode) count += PacketSize * offset;
|
||||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
||||||
if (hasAlpha)
|
if (hasAlpha)
|
||||||
{
|
{
|
||||||
@ -439,34 +446,36 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
|
|||||||
count += PacketSize;
|
count += PacketSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(PanelMode) count += PacketSize * (stride-offset-depth);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// this version is optimized for row major matrices
|
// this version is optimized for row major matrices
|
||||||
template<typename Scalar, int nr>
|
template<typename Scalar, int nr, bool PanelMode>
|
||||||
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
|
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
|
||||||
{
|
{
|
||||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||||
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
|
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
|
||||||
|
int stride=0, int offset=0)
|
||||||
{
|
{
|
||||||
|
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
||||||
bool hasAlpha = alpha != Scalar(1);
|
bool hasAlpha = alpha != Scalar(1);
|
||||||
int packet_cols = (cols/nr) * nr;
|
int packet_cols = (cols/nr) * nr;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||||
{
|
{
|
||||||
|
// skip what we have before
|
||||||
|
if(PanelMode) count += PacketSize * nr * offset;
|
||||||
if (hasAlpha)
|
if (hasAlpha)
|
||||||
{
|
{
|
||||||
for(int k=0; k<depth; k++)
|
for(int k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
|
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
|
||||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
|
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
|
||||||
if (nr==4)
|
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
|
||||||
{
|
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
|
||||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
|
|
||||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
|
|
||||||
}
|
|
||||||
count += nr*PacketSize;
|
count += nr*PacketSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -475,26 +484,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
|
|||||||
for(int k=0; k<depth; k++)
|
for(int k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
|
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
|
||||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
|
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
|
||||||
if (nr==4)
|
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
|
||||||
{
|
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
|
||||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
|
|
||||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
|
|
||||||
}
|
|
||||||
count += nr*PacketSize;
|
count += nr*PacketSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// skip what we have after
|
||||||
|
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
|
||||||
}
|
}
|
||||||
// copy the remaining columns one at a time (nr==1)
|
// copy the remaining columns one at a time (nr==1)
|
||||||
for(int j2=packet_cols; j2<cols; ++j2)
|
for(int j2=packet_cols; j2<cols; ++j2)
|
||||||
{
|
{
|
||||||
|
if(PanelMode) count += PacketSize * offset;
|
||||||
const Scalar* b0 = &rhs[j2];
|
const Scalar* b0 = &rhs[j2];
|
||||||
for(int k=0; k<depth; k++)
|
for(int k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
|
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
|
||||||
count += PacketSize;
|
count += PacketSize;
|
||||||
}
|
}
|
||||||
|
if(PanelMode) count += PacketSize * (stride-offset-depth);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -25,9 +25,6 @@
|
|||||||
#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
|
#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
|
||||||
#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
|
#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
|
||||||
|
|
||||||
template<typename Scalar, int nr>
|
|
||||||
struct ei_gemm_pack_rhs_panel;
|
|
||||||
|
|
||||||
// if the rhs is row major, we have to evaluate it in a temporary colmajor matrix
|
// if the rhs is row major, we have to evaluate it in a temporary colmajor matrix
|
||||||
template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode>
|
template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode>
|
||||||
struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode>
|
struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode>
|
||||||
@ -136,7 +133,7 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
|
|||||||
int blockBOffset = IsLowerTriangular ? k1 : lengthTarget;
|
int blockBOffset = IsLowerTriangular ? k1 : lengthTarget;
|
||||||
|
|
||||||
// update the respective rows of B from rhs
|
// update the respective rows of B from rhs
|
||||||
ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>()
|
ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true>()
|
||||||
(blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
|
(blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
|
||||||
|
|
||||||
// GEBP
|
// GEBP
|
||||||
@ -174,46 +171,4 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Scalar, int nr>
|
|
||||||
struct ei_gemm_pack_rhs_panel
|
|
||||||
{
|
|
||||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
|
||||||
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset)
|
|
||||||
{
|
|
||||||
int packet_cols = (cols/nr) * nr;
|
|
||||||
int count = 0;
|
|
||||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
|
||||||
{
|
|
||||||
// skip what we have before
|
|
||||||
count += PacketSize * nr * offset;
|
|
||||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
|
||||||
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
|
|
||||||
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
|
|
||||||
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
|
|
||||||
for(int k=0; k<depth; k++)
|
|
||||||
{
|
|
||||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
|
|
||||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
|
|
||||||
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
|
|
||||||
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
|
|
||||||
count += nr*PacketSize;
|
|
||||||
}
|
|
||||||
// skip what we have after
|
|
||||||
count += PacketSize * nr * (stride-offset-depth);
|
|
||||||
}
|
|
||||||
// copy the remaining columns one at a time (nr==1)
|
|
||||||
for(int j2=packet_cols; j2<cols; ++j2)
|
|
||||||
{
|
|
||||||
count += PacketSize * offset;
|
|
||||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
|
||||||
for(int k=0; k<depth; k++)
|
|
||||||
{
|
|
||||||
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
|
|
||||||
count += PacketSize;
|
|
||||||
}
|
|
||||||
count += PacketSize * (stride-offset-depth);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
|
#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
|
||||||
|
@ -32,7 +32,7 @@
|
|||||||
template<typename Scalar, int mr, int nr, typename Conj>
|
template<typename Scalar, int mr, int nr, typename Conj>
|
||||||
struct ei_gebp_kernel;
|
struct ei_gebp_kernel;
|
||||||
|
|
||||||
template<typename Scalar, int nr, int StorageOrder>
|
template<typename Scalar, int nr, int StorageOrder, bool PanelMode=false>
|
||||||
struct ei_gemm_pack_rhs;
|
struct ei_gemm_pack_rhs;
|
||||||
|
|
||||||
template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false>
|
template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user