mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
Use vectorization when packing row-major rhs matrices. (bug #717)
This commit is contained in:
parent
033ee7f6d9
commit
ce99b502ce
@ -1261,6 +1261,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
|||||||
// Partial specialization of gemm_pack_rhs for a RowMajor right-hand side.
// It only declares the packing functor; the out-of-line operator() definition
// follows further down in this diff.
// NOTE(review): this is a side-by-side diff rendering, so each declaration
// below appears twice (old side, then new side).
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
|
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
|
||||||
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
|
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
|
||||||
{
|
{
|
||||||
|
// Packet type for Scalar as reported by packet_traits. This line is present on
// one side of the diff only; operator() uses it with ploadu<Packet>/pstoreu
// when nr == PacketSize.
typedef typename packet_traits<Scalar>::type Packet;
|
||||||
// Number of Scalar coefficients per packet, as reported by packet_traits.
enum { PacketSize = packet_traits<Scalar>::size };
|
enum { PacketSize = packet_traits<Scalar>::size };
|
||||||
// Packs coefficients read from `rhs` (indexed as rhs[k*rhsStride + j2], with
// k running over [0, depth)) into `blockB`. `stride` and `offset` are only
// consulted in the visible code when PanelMode is true, where they add padding
// before and after each packed panel.
// NOTE(review): `cols` presumably bounds the column loop, which is outside the
// visible hunks -- confirm against the full definition.
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
|
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||||
};
|
};
|
||||||
@ -1282,6 +1283,11 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
|
|||||||
if(PanelMode) count += nr * offset;
|
if(PanelMode) count += nr * offset;
|
||||||
for(Index k=0; k<depth; k++)
|
for(Index k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
|
if (nr == PacketSize) {
|
||||||
|
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
||||||
|
pstoreu(blockB+count, cj.pconj(A));
|
||||||
|
count += PacketSize;
|
||||||
|
} else {
|
||||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||||
blockB[count+0] = cj(b0[0]);
|
blockB[count+0] = cj(b0[0]);
|
||||||
blockB[count+1] = cj(b0[1]);
|
blockB[count+1] = cj(b0[1]);
|
||||||
@ -1289,6 +1295,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
|
|||||||
if(nr==4) blockB[count+3] = cj(b0[3]);
|
if(nr==4) blockB[count+3] = cj(b0[3]);
|
||||||
count += nr;
|
count += nr;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// skip what we have after
|
// skip what we have after
|
||||||
if(PanelMode) count += nr * (stride-offset-depth);
|
if(PanelMode) count += nr * (stride-offset-depth);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user