trmm is now working in all storage order configurations

This commit is contained in:
Gael Guennebaud 2009-07-27 10:27:01 +02:00
parent 1d4d9a37fd
commit 6aba84719d
4 changed files with 37 additions and 71 deletions

View File

@ -189,6 +189,7 @@ namespace Eigen {
#include "src/Core/products/SelfadjointRank2Update.h"
#include "src/Core/products/TriangularMatrixVector.h"
#include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/products/TriangularMatrixMatrix.h"
#include "src/Core/BandMatrix.h"
} // namespace Eigen

View File

@ -125,7 +125,7 @@ struct ei_gebp_kernel
// loops on each register blocking of lhs/res
for(int i=0; i<peeled_mc; i+=mr)
{
const Scalar* blA = &blockA[i*strideA];
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
@ -248,7 +248,7 @@ struct ei_gebp_kernel
}
for(int i=peeled_mc; i<rows; i++)
{
const Scalar* blA = &blockA[i*strideA];
const Scalar* blA = &blockA[i*strideA+offsetA];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
@ -285,7 +285,7 @@ struct ei_gebp_kernel
{
for(int i=0; i<peeled_mc; i+=mr)
{
const Scalar* blA = &blockA[i*strideA];
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
@ -317,7 +317,7 @@ struct ei_gebp_kernel
}
for(int i=peeled_mc; i<rows; i++)
{
const Scalar* blA = &blockA[i*strideA];
const Scalar* blA = &blockA[i*strideA+offsetA];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
@ -375,17 +375,21 @@ struct ei_gemm_pack_lhs
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
template<typename Scalar, int nr>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
int stride=0, int offset=0)
{
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
bool hasAlpha = alpha != Scalar(1);
int packet_cols = (cols/nr) * nr;
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
if(PanelMode) count += PacketSize * nr * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
@ -418,10 +422,13 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
count += nr*PacketSize;
}
}
// skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
}
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{
if(PanelMode) count += PacketSize * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
if (hasAlpha)
{
@ -439,34 +446,36 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
count += PacketSize;
}
}
if(PanelMode) count += PacketSize * (stride-offset-depth);
}
}
};
// this version is optimized for row major matrices
template<typename Scalar, int nr>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
int stride=0, int offset=0)
{
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
bool hasAlpha = alpha != Scalar(1);
int packet_cols = (cols/nr) * nr;
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
if(PanelMode) count += PacketSize * nr * offset;
if (hasAlpha)
{
for(int k=0; k<depth; k++)
{
const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
if (nr==4)
{
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
}
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
count += nr*PacketSize;
}
}
@ -475,26 +484,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
for(int k=0; k<depth; k++)
{
const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
if (nr==4)
{
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
}
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
count += nr*PacketSize;
}
}
// skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
}
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{
if(PanelMode) count += PacketSize * offset;
const Scalar* b0 = &rhs[j2];
for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
count += PacketSize;
}
if(PanelMode) count += PacketSize * (stride-offset-depth);
}
}
};

View File

@ -25,9 +25,6 @@
#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
template<typename Scalar, int nr>
struct ei_gemm_pack_rhs_panel;
// if the rhs is row major, we have to evaluate it in a temporary colmajor matrix
template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode>
struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode>
@ -136,7 +133,7 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
int blockBOffset = IsLowerTriangular ? k1 : lengthTarget;
// update the respective rows of B from rhs
ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>()
ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true>()
(blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
// GEBP
@ -174,46 +171,4 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
}
};
template<typename Scalar, int nr>
struct ei_gemm_pack_rhs_panel
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset)
{
int packet_cols = (cols/nr) * nr;
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
count += PacketSize * nr * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
count += nr*PacketSize;
}
// skip what we have after
count += PacketSize * nr * (stride-offset-depth);
}
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{
count += PacketSize * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
count += PacketSize;
}
count += PacketSize * (stride-offset-depth);
}
}
};
#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H

View File

@ -32,7 +32,7 @@
template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel;
template<typename Scalar, int nr, int StorageOrder>
template<typename Scalar, int nr, int StorageOrder, bool PanelMode=false>
struct ei_gemm_pack_rhs;
template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false>