trmm is now working in all storage order configurations

This commit is contained in:
Gael Guennebaud 2009-07-27 10:27:01 +02:00
parent 1d4d9a37fd
commit 6aba84719d
4 changed files with 37 additions and 71 deletions

View File

@ -189,6 +189,7 @@ namespace Eigen {
#include "src/Core/products/SelfadjointRank2Update.h" #include "src/Core/products/SelfadjointRank2Update.h"
#include "src/Core/products/TriangularMatrixVector.h" #include "src/Core/products/TriangularMatrixVector.h"
#include "src/Core/products/TriangularSolverMatrix.h" #include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/products/TriangularMatrixMatrix.h"
#include "src/Core/BandMatrix.h" #include "src/Core/BandMatrix.h"
} // namespace Eigen } // namespace Eigen

View File

@ -125,7 +125,7 @@ struct ei_gebp_kernel
// loops on each register blocking of lhs/res // loops on each register blocking of lhs/res
for(int i=0; i<peeled_mc; i+=mr) for(int i=0; i<peeled_mc; i+=mr)
{ {
const Scalar* blA = &blockA[i*strideA]; const Scalar* blA = &blockA[i*strideA+offsetA*mr];
#ifdef EIGEN_VECTORIZE_SSE #ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif #endif
@ -248,7 +248,7 @@ struct ei_gebp_kernel
} }
for(int i=peeled_mc; i<rows; i++) for(int i=peeled_mc; i<rows; i++)
{ {
const Scalar* blA = &blockA[i*strideA]; const Scalar* blA = &blockA[i*strideA+offsetA];
#ifdef EIGEN_VECTORIZE_SSE #ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif #endif
@ -285,7 +285,7 @@ struct ei_gebp_kernel
{ {
for(int i=0; i<peeled_mc; i+=mr) for(int i=0; i<peeled_mc; i+=mr)
{ {
const Scalar* blA = &blockA[i*strideA]; const Scalar* blA = &blockA[i*strideA+offsetA*mr];
#ifdef EIGEN_VECTORIZE_SSE #ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif #endif
@ -317,7 +317,7 @@ struct ei_gebp_kernel
} }
for(int i=peeled_mc; i<rows; i++) for(int i=peeled_mc; i<rows; i++)
{ {
const Scalar* blA = &blockA[i*strideA]; const Scalar* blA = &blockA[i*strideA+offsetA];
#ifdef EIGEN_VECTORIZE_SSE #ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif #endif
@ -375,17 +375,21 @@ struct ei_gemm_pack_lhs
// 4 5 6 7 16 17 18 19 25 28 // 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29 // 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . . // . . . . . . . . . .
template<typename Scalar, int nr> template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
{ {
enum { PacketSize = ei_packet_traits<Scalar>::size }; enum { PacketSize = ei_packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
int stride=0, int offset=0)
{ {
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
bool hasAlpha = alpha != Scalar(1); bool hasAlpha = alpha != Scalar(1);
int packet_cols = (cols/nr) * nr; int packet_cols = (cols/nr) * nr;
int count = 0; int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr) for(int j2=0; j2<packet_cols; j2+=nr)
{ {
// skip what we have before
if(PanelMode) count += PacketSize * nr * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride]; const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride];
@ -418,10 +422,13 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
count += nr*PacketSize; count += nr*PacketSize;
} }
} }
// skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
} }
// copy the remaining columns one at a time (nr==1) // copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2) for(int j2=packet_cols; j2<cols; ++j2)
{ {
if(PanelMode) count += PacketSize * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride]; const Scalar* b0 = &rhs[(j2+0)*rhsStride];
if (hasAlpha) if (hasAlpha)
{ {
@ -439,34 +446,36 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
count += PacketSize; count += PacketSize;
} }
} }
if(PanelMode) count += PacketSize * (stride-offset-depth);
} }
} }
}; };
// this version is optimized for row major matrices // this version is optimized for row major matrices
template<typename Scalar, int nr> template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor> struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
{ {
enum { PacketSize = ei_packet_traits<Scalar>::size }; enum { PacketSize = ei_packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
int stride=0, int offset=0)
{ {
ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
bool hasAlpha = alpha != Scalar(1); bool hasAlpha = alpha != Scalar(1);
int packet_cols = (cols/nr) * nr; int packet_cols = (cols/nr) * nr;
int count = 0; int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr) for(int j2=0; j2<packet_cols; j2+=nr)
{ {
// skip what we have before
if(PanelMode) count += PacketSize * nr * offset;
if (hasAlpha) if (hasAlpha)
{ {
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
const Scalar* b0 = &rhs[k*rhsStride + j2]; const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1])); ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
if (nr==4) if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
{ if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
}
count += nr*PacketSize; count += nr*PacketSize;
} }
} }
@ -475,26 +484,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
const Scalar* b0 = &rhs[k*rhsStride + j2]; const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1])); ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
if (nr==4) if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
{ if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
}
count += nr*PacketSize; count += nr*PacketSize;
} }
} }
// skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
} }
// copy the remaining columns one at a time (nr==1) // copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2) for(int j2=packet_cols; j2<cols; ++j2)
{ {
if(PanelMode) count += PacketSize * offset;
const Scalar* b0 = &rhs[j2]; const Scalar* b0 = &rhs[j2];
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride])); ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
count += PacketSize; count += PacketSize;
} }
if(PanelMode) count += PacketSize * (stride-offset-depth);
} }
} }
}; };

View File

@ -25,9 +25,6 @@
#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
template<typename Scalar, int nr>
struct ei_gemm_pack_rhs_panel;
// if the rhs is row major, we have to evaluate it in a temporary colmajor matrix // if the rhs is row major, we have to evaluate it in a temporary colmajor matrix
template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode> template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode>
struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode> struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode>
@ -136,7 +133,7 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
int blockBOffset = IsLowerTriangular ? k1 : lengthTarget; int blockBOffset = IsLowerTriangular ? k1 : lengthTarget;
// update the respective rows of B from rhs // update the respective rows of B from rhs
ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>() ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true>()
(blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset); (blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
// GEBP // GEBP
@ -174,46 +171,4 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
} }
}; };
template<typename Scalar, int nr>
struct ei_gemm_pack_rhs_panel
{
enum { PacketSize = ei_packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset)
{
int packet_cols = (cols/nr) * nr;
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
count += PacketSize * nr * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
count += nr*PacketSize;
}
// skip what we have after
count += PacketSize * nr * (stride-offset-depth);
}
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{
count += PacketSize * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
for(int k=0; k<depth; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
count += PacketSize;
}
count += PacketSize * (stride-offset-depth);
}
}
};
#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H #endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H

View File

@ -32,7 +32,7 @@
template<typename Scalar, int mr, int nr, typename Conj> template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel; struct ei_gebp_kernel;
template<typename Scalar, int nr, int StorageOrder> template<typename Scalar, int nr, int StorageOrder, bool PanelMode=false>
struct ei_gemm_pack_rhs; struct ei_gemm_pack_rhs;
template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false> template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false>