Improved the efficiency if the block-panel matrix multiplication code: the change reduces the pressure on the L1 cache by removing the calls to gebp_traits::unpackRhs(). Instead the packetization of the rhs blocks is done on the fly in gebp_traits::loadRhs(). This adds numerous calls to pset1<ResPacket> (since we're packetizing on the fly in the inner loop) but this is more than compensated by the fact that we're decreasing the memory transfers by a factor RhsPacketSize.

This commit is contained in:
Benoit Steiner 2014-01-02 16:18:32 -08:00
parent 60cd361ebe
commit c8c81c1e74
5 changed files with 38 additions and 95 deletions

View File

@ -10,6 +10,7 @@
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@ -169,7 +170,7 @@ public:
WorkSpaceFactor = nr * RhsPacketSize, WorkSpaceFactor = nr * RhsPacketSize,
LhsProgress = LhsPacketSize, LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize RhsProgress = 1
}; };
typedef typename packet_traits<LhsScalar>::type _LhsPacket; typedef typename packet_traits<LhsScalar>::type _LhsPacket;
@ -187,15 +188,9 @@ public:
p = pset1<ResPacket>(ResScalar(0)); p = pset1<ResPacket>(ResScalar(0));
} }
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
{
for(DenseIndex k=0; k<n; k++)
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{ {
dest = pload<RhsPacket>(b); dest = pset1<RhsPacket>(*b);
} }
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
@ -240,7 +235,7 @@ public:
WorkSpaceFactor = nr*RhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = LhsPacketSize, LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize RhsProgress = 1
}; };
typedef typename packet_traits<LhsScalar>::type _LhsPacket; typedef typename packet_traits<LhsScalar>::type _LhsPacket;
@ -258,15 +253,9 @@ public:
p = pset1<ResPacket>(ResScalar(0)); p = pset1<ResPacket>(ResScalar(0));
} }
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
{
for(DenseIndex k=0; k<n; k++)
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{ {
dest = pload<RhsPacket>(b); dest = pset1<RhsPacket>(*b);
} }
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
@ -320,7 +309,7 @@ public:
WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
LhsProgress = ResPacketSize, LhsProgress = ResPacketSize,
RhsProgress = Vectorizable ? 2*ResPacketSize : 1 RhsProgress = 1
}; };
typedef typename packet_traits<RealScalar>::type RealPacket; typedef typename packet_traits<RealScalar>::type RealPacket;
@ -344,30 +333,15 @@ public:
p.second = pset1<RealPacket>(RealScalar(0)); p.second = pset1<RealPacket>(RealScalar(0));
} }
/* Unpack the rhs coeff such that each complex coefficient is spread into EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
* two packects containing respectively the real and imaginary coefficient
* duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
*/
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
{ {
for(DenseIndex k=0; k<n; k++) dest = pset1<ResPacket>(*b);
{
if(Vectorizable)
{
pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k]));
pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
} }
else
b[k] = rhs[k];
}
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
{ {
dest.first = pload<RealPacket>((const RealScalar*)b); dest.first = pset1<RealPacket>(real(*b));
dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); dest.second = pset1<RealPacket>(imag(*b));
} }
// nothing special here // nothing special here
@ -445,7 +419,7 @@ public:
WorkSpaceFactor = nr*RhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = ResPacketSize, LhsProgress = ResPacketSize,
RhsProgress = ResPacketSize RhsProgress = 1
}; };
typedef typename packet_traits<LhsScalar>::type _LhsPacket; typedef typename packet_traits<LhsScalar>::type _LhsPacket;
@ -463,15 +437,9 @@ public:
p = pset1<ResPacket>(ResScalar(0)); p = pset1<ResPacket>(ResScalar(0));
} }
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
{
for(DenseIndex k=0; k<n; k++)
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{ {
dest = pload<RhsPacket>(b); dest = pset1<RhsPacket>(*b);
} }
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
@ -529,14 +497,14 @@ struct gebp_kernel
EIGEN_DONT_INLINE EIGEN_DONT_INLINE
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
}; };
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) Index strideA, Index strideB, Index offsetA, Index offsetB)
{ {
Traits traits; Traits traits;
@ -550,14 +518,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
const Index peeled_kc = (depth/4)*4; const Index peeled_kc = (depth/4)*4;
if(unpackedB==0)
unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
// loops on each micro vertical panel of rhs (depth x nr) // loops on each micro vertical panel of rhs (depth x nr)
for(Index j2=0; j2<packet_cols; j2+=nr) for(Index j2=0; j2<packet_cols; j2+=nr)
{ {
traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);
// loops on each largest micro horizontal panel of lhs (mr x depth) // loops on each largest micro horizontal panel of lhs (mr x depth)
// => we select a mr x nr micro block of res which is entirely // => we select a mr x nr micro block of res which is entirely
// stored into mr/packet_size x nr registers. // stored into mr/packet_size x nr registers.
@ -590,7 +553,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
// performs "inner" product // performs "inner" product
// TODO let's check wether the folowing peeled loop could not be // TODO let's check wether the folowing peeled loop could not be
// optimized via optimal prefetching from one loop to the other // optimized via optimal prefetching from one loop to the other
const RhsScalar* blB = unpackedB; const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
for(Index k=0; k<peeled_kc; k+=4) for(Index k=0; k<peeled_kc; k+=4)
{ {
if(nr==2) if(nr==2)
@ -819,7 +782,7 @@ EIGEN_ASM_COMMENT("mybegin4");
if(nr==4) traits.initAcc(C3); if(nr==4) traits.initAcc(C3);
// performs "inner" product // performs "inner" product
const RhsScalar* blB = unpackedB; const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
for(Index k=0; k<peeled_kc; k+=4) for(Index k=0; k<peeled_kc; k+=4)
{ {
if(nr==2) if(nr==2)
@ -1006,9 +969,6 @@ EIGEN_ASM_COMMENT("mybegin4");
// => do the same but with nr==1 // => do the same but with nr==1
for(Index j2=packet_cols; j2<cols; j2++) for(Index j2=packet_cols; j2<cols; j2++)
{ {
// unpack B
traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
for(Index i=0; i<peeled_mc; i+=mr) for(Index i=0; i<peeled_mc; i+=mr)
{ {
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
@ -1021,7 +981,7 @@ EIGEN_ASM_COMMENT("mybegin4");
traits.initAcc(C0); traits.initAcc(C0);
traits.initAcc(C4); traits.initAcc(C4);
const RhsScalar* blB = unpackedB; const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++) for(Index k=0; k<depth; k++)
{ {
LhsPacket A0, A1; LhsPacket A0, A1;
@ -1060,7 +1020,7 @@ EIGEN_ASM_COMMENT("mybegin4");
AccPacket C0; AccPacket C0;
traits.initAcc(C0); traits.initAcc(C0);
const RhsScalar* blB = unpackedB; const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++) for(Index k=0; k<depth; k++)
{ {
LhsPacket A0; LhsPacket A0;

View File

@ -81,9 +81,7 @@ static void run(Index rows, Index cols, Index depth,
Index threads = omp_get_num_threads(); Index threads = omp_get_num_threads();
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
RhsScalar* blockB = blocking.blockB(); RhsScalar* blockB = blocking.blockB();
eigen_internal_assert(blockB!=0); eigen_internal_assert(blockB!=0);
@ -122,7 +120,7 @@ static void run(Index rows, Index cols, Index depth,
if(shift>0) if(shift>0)
while(info[j].sync!=k) {} while(info[j].sync!=k) {}
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0);
} }
// Then keep going as usual with the remaining A' // Then keep going as usual with the remaining A'
@ -134,7 +132,7 @@ static void run(Index rows, Index cols, Index depth,
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
// C_i += A' * B' // C_i += A' * B'
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0);
} }
// Release all the sub blocks B'_j of B' for the current thread, // Release all the sub blocks B'_j of B' for the current thread,
@ -152,11 +150,9 @@ static void run(Index rows, Index cols, Index depth,
// this is the sequential version! // this is the sequential version!
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
// For each horizontal panel of the rhs, and corresponding panel of the lhs... // For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1) // (==GEMM_VAR1)
@ -182,7 +178,7 @@ static void run(Index rows, Index cols, Index depth,
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
// Everything is packed, we can now call the block * panel kernel: // Everything is packed, we can now call the block * panel kernel:
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
} }
} }
} }

View File

@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
if(mc > Traits::nr) if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr; mc = (mc/Traits::nr)*Traits::nr;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
RhsScalar* blockB = allocatedBlockB + sizeW;
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
// 3 - after the diagonal => processed with gebp or skipped // 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower) if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0, allocatedBlockB); -1, -1, 0, 0);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper) if (UpLo==Upper)
{ {
Index j2 = i2+actual_mc; Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0, allocatedBlockB); -1, -1, 0, 0);
} }
} }
} }
@ -136,7 +133,7 @@ struct tribb_kernel
enum { enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
}; };
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{ {
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
@ -150,7 +147,7 @@ struct tribb_kernel
if(UpLo==Upper) if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace); -1, -1, 0, 0);
// selfadjoint micro block // selfadjoint micro block
{ {
@ -158,7 +155,7 @@ struct tribb_kernel
buffer.setZero(); buffer.setZero();
// 1 - apply the kernel on the temporary buffer // 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace); -1, -1, 0, 0);
// 2 - triangular accumulation // 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1) for(Index j1=0; j1<actualBlockSize; ++j1)
{ {
@ -173,7 +170,7 @@ struct tribb_kernel
{ {
Index i = j+actualBlockSize; Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace); -1, -1, 0, 0);
} }
} }
} }

View File

@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero(); triangularBuffer.setZero();
@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset, blockW); actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel // GEBP with remaining micro panel
if (lengthTarget>0) if (lengthTarget>0)
@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset, blockW); actualPanelWidth, actual_kc, 0, blockBOffset);
} }
} }
} }
@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
} }
} }
} }
@ -266,11 +264,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero(); triangularBuffer.setZero();
@ -357,14 +353,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
actual_mc, panelLength, actualPanelWidth, actual_mc, panelLength, actualPanelWidth,
alpha, alpha,
actual_kc, actual_kc, // strides actual_kc, actual_kc, // strides
blockOffset, blockOffset,// offsets blockOffset, blockOffset);// offsets
blockW); // workspace
} }
} }
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
blockA, geb, actual_mc, actual_kc, rs, blockA, geb, actual_mc, actual_kc, rs,
alpha, alpha,
-1, -1, 0, 0, blockW); -1, -1, 0, 0);
} }
} }
} }

View File

@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
conj_if<Conjugate> conj; conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset, blockW); actualPanelWidth, actual_kc, 0, blockBOffset);
} }
} }
} }
@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
{ {
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW); gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
} }
} }
} }
@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*size; std::size_t sizeB = kc*size;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
conj_if<Conjugate> conj; conj_if<Conjugate> conj;
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
actual_mc, panelLength, actualPanelWidth, actual_mc, panelLength, actualPanelWidth,
Scalar(-1), Scalar(-1),
actual_kc, actual_kc, // strides actual_kc, actual_kc, // strides
panelOffset, panelOffset, // offsets panelOffset, panelOffset); // offsets
blockW); // workspace
} }
// unblocked triangular solve // unblocked triangular solve
@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
if (rs>0) if (rs>0)
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
actual_mc, actual_kc, rs, Scalar(-1), actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0, blockW); -1, -1, 0, 0);
} }
} }
} }