mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-07 03:39:04 +08:00
Generalized the gebp apis
This commit is contained in:
parent
8b2afe33a1
commit
b7271dffb5
@ -667,7 +667,7 @@ protected:
|
||||
* |real |cplx | no vectorization yet, would require to pack A with duplication
|
||||
* |cplx |real | easy vectorization
|
||||
*/
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||
struct gebp_kernel
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
|
||||
@ -676,14 +676,15 @@ struct gebp_kernel
|
||||
typedef typename Traits::RhsPacket RhsPacket;
|
||||
typedef typename Traits::ResPacket ResPacket;
|
||||
typedef typename Traits::AccPacket AccPacket;
|
||||
|
||||
|
||||
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
|
||||
typedef typename SwappedTraits::ResScalar SResScalar;
|
||||
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
||||
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
||||
typedef typename SwappedTraits::ResPacket SResPacket;
|
||||
typedef typename SwappedTraits::AccPacket SAccPacket;
|
||||
|
||||
|
||||
typedef typename DataMapper::LinearMapper LinearMapper;
|
||||
|
||||
enum {
|
||||
Vectorizable = Traits::Vectorizable,
|
||||
@ -693,14 +694,16 @@ struct gebp_kernel
|
||||
};
|
||||
|
||||
EIGEN_DONT_INLINE
|
||||
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
|
||||
void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha,
|
||||
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
|
||||
};
|
||||
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||
EIGEN_DONT_INLINE
|
||||
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
|
||||
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha,
|
||||
Index strideA, Index strideB, Index offsetA, Index offsetB)
|
||||
{
|
||||
Traits traits;
|
||||
@ -743,15 +746,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
|
||||
traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
|
||||
|
||||
ResScalar* r0 = &res[(j2+0)*resStride + i];
|
||||
ResScalar* r1 = &res[(j2+1)*resStride + i];
|
||||
ResScalar* r2 = &res[(j2+2)*resStride + i];
|
||||
ResScalar* r3 = &res[(j2+3)*resStride + i];
|
||||
|
||||
internal::prefetch(r0);
|
||||
internal::prefetch(r1);
|
||||
internal::prefetch(r2);
|
||||
internal::prefetch(r3);
|
||||
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
||||
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
||||
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
||||
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
||||
|
||||
r0.prefetch(0);
|
||||
r1.prefetch(0);
|
||||
r2.prefetch(0);
|
||||
r3.prefetch(0);
|
||||
|
||||
// performs "inner" products
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
||||
@ -813,48 +816,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
|
||||
ResPacket R0, R1, R2;
|
||||
ResPacket alphav = pset1<ResPacket>(alpha);
|
||||
|
||||
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
|
||||
|
||||
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
||||
traits.acc(C0, alphav, R0);
|
||||
traits.acc(C4, alphav, R1);
|
||||
traits.acc(C8, alphav, R2);
|
||||
pstoreu(r0+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r0+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r0+2*Traits::ResPacketSize, R2);
|
||||
|
||||
R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize);
|
||||
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
||||
|
||||
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
|
||||
traits.acc(C1, alphav, R0);
|
||||
traits.acc(C5, alphav, R1);
|
||||
traits.acc(C9, alphav, R2);
|
||||
pstoreu(r1+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r1+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r1+2*Traits::ResPacketSize, R2);
|
||||
|
||||
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize);
|
||||
r1.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r1.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r1.storePacket(2 * Traits::ResPacketSize, R2);
|
||||
|
||||
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
|
||||
traits.acc(C2, alphav, R0);
|
||||
traits.acc(C6, alphav, R1);
|
||||
traits.acc(C10, alphav, R2);
|
||||
pstoreu(r2+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r2+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r2+2*Traits::ResPacketSize, R2);
|
||||
|
||||
R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize);
|
||||
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r2.storePacket(2 * Traits::ResPacketSize, R2);
|
||||
|
||||
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
|
||||
traits.acc(C3, alphav, R0);
|
||||
traits.acc(C7, alphav, R1);
|
||||
traits.acc(C11, alphav, R2);
|
||||
pstoreu(r3+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r3+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r3+2*Traits::ResPacketSize, R2);
|
||||
r3.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r3.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r3.storePacket(2 * Traits::ResPacketSize, R2);
|
||||
}
|
||||
|
||||
|
||||
// Deal with remaining columns of the rhs
|
||||
for(Index j2=packet_cols4; j2<cols; j2++)
|
||||
{
|
||||
@ -868,7 +871,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
traits.initAcc(C4);
|
||||
traits.initAcc(C8);
|
||||
|
||||
ResScalar* r0 = &res[(j2+0)*resStride + i];
|
||||
LinearMapper r0 = res.getLinearMapper(i, j2);
|
||||
r0.prefetch(0);
|
||||
|
||||
// performs "inner" products
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
||||
@ -912,19 +916,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
ResPacket R0, R1, R2;
|
||||
ResPacket alphav = pset1<ResPacket>(alpha);
|
||||
|
||||
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
|
||||
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
||||
traits.acc(C0, alphav, R0);
|
||||
traits.acc(C4, alphav, R1);
|
||||
traits.acc(C8 , alphav, R2);
|
||||
pstoreu(r0+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r0+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r0+2*Traits::ResPacketSize, R2);
|
||||
traits.acc(C8, alphav, R2);
|
||||
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------- Process 2 * LhsProgress rows at once ----------
|
||||
if(mr>=2*Traits::LhsProgress)
|
||||
{
|
||||
@ -946,15 +950,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
|
||||
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
|
||||
|
||||
ResScalar* r0 = &res[(j2+0)*resStride + i];
|
||||
ResScalar* r1 = &res[(j2+1)*resStride + i];
|
||||
ResScalar* r2 = &res[(j2+2)*resStride + i];
|
||||
ResScalar* r3 = &res[(j2+3)*resStride + i];
|
||||
|
||||
internal::prefetch(r0+prefetch_res_offset);
|
||||
internal::prefetch(r1+prefetch_res_offset);
|
||||
internal::prefetch(r2+prefetch_res_offset);
|
||||
internal::prefetch(r3+prefetch_res_offset);
|
||||
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
||||
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
||||
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
||||
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
||||
|
||||
r0.prefetch(prefetch_res_offset);
|
||||
r1.prefetch(prefetch_res_offset);
|
||||
r2.prefetch(prefetch_res_offset);
|
||||
r3.prefetch(prefetch_res_offset);
|
||||
|
||||
// performs "inner" products
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
||||
@ -978,7 +982,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
traits.madd(A1, B2, C6, B2); \
|
||||
traits.madd(A0, B3, C3, T0); \
|
||||
traits.madd(A1, B3, C7, B3)
|
||||
|
||||
|
||||
internal::prefetch(blB+(48+0));
|
||||
EIGEN_GEBGP_ONESTEP(0);
|
||||
EIGEN_GEBGP_ONESTEP(1);
|
||||
@ -1002,37 +1006,37 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
blA += 2*Traits::LhsProgress;
|
||||
}
|
||||
#undef EIGEN_GEBGP_ONESTEP
|
||||
|
||||
|
||||
ResPacket R0, R1, R2, R3;
|
||||
ResPacket alphav = pset1<ResPacket>(alpha);
|
||||
|
||||
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
|
||||
R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
|
||||
|
||||
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
|
||||
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
|
||||
traits.acc(C0, alphav, R0);
|
||||
traits.acc(C4, alphav, R1);
|
||||
traits.acc(C1, alphav, R2);
|
||||
traits.acc(C5, alphav, R3);
|
||||
pstoreu(r0+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r0+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r1+0*Traits::ResPacketSize, R2);
|
||||
pstoreu(r1+1*Traits::ResPacketSize, R3);
|
||||
|
||||
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
|
||||
R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
|
||||
R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
|
||||
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r1.storePacket(0 * Traits::ResPacketSize, R2);
|
||||
r1.storePacket(1 * Traits::ResPacketSize, R3);
|
||||
|
||||
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
||||
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
|
||||
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
|
||||
traits.acc(C2, alphav, R0);
|
||||
traits.acc(C6, alphav, R1);
|
||||
traits.acc(C3, alphav, R2);
|
||||
traits.acc(C7, alphav, R3);
|
||||
pstoreu(r2+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r2+1*Traits::ResPacketSize, R1);
|
||||
pstoreu(r3+0*Traits::ResPacketSize, R2);
|
||||
pstoreu(r3+1*Traits::ResPacketSize, R3);
|
||||
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
r3.storePacket(0 * Traits::ResPacketSize, R2);
|
||||
r3.storePacket(1 * Traits::ResPacketSize, R3);
|
||||
}
|
||||
|
||||
|
||||
// Deal with remaining columns of the rhs
|
||||
for(Index j2=packet_cols4; j2<cols; j2++)
|
||||
{
|
||||
@ -1045,8 +1049,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
traits.initAcc(C0);
|
||||
traits.initAcc(C4);
|
||||
|
||||
ResScalar* r0 = &res[(j2+0)*resStride + i];
|
||||
internal::prefetch(r0+prefetch_res_offset);
|
||||
LinearMapper r0 = res.getLinearMapper(i, j2);
|
||||
r0.prefetch(prefetch_res_offset);
|
||||
|
||||
// performs "inner" products
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
||||
@ -1089,12 +1093,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
ResPacket R0, R1;
|
||||
ResPacket alphav = pset1<ResPacket>(alpha);
|
||||
|
||||
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
|
||||
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
||||
traits.acc(C0, alphav, R0);
|
||||
traits.acc(C4, alphav, R1);
|
||||
pstoreu(r0+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r0+1*Traits::ResPacketSize, R1);
|
||||
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1120,15 +1124,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
traits.initAcc(C2);
|
||||
traits.initAcc(C3);
|
||||
|
||||
ResScalar* r0 = &res[(j2+0)*resStride + i];
|
||||
ResScalar* r1 = &res[(j2+1)*resStride + i];
|
||||
ResScalar* r2 = &res[(j2+2)*resStride + i];
|
||||
ResScalar* r3 = &res[(j2+3)*resStride + i];
|
||||
|
||||
internal::prefetch(r0+prefetch_res_offset);
|
||||
internal::prefetch(r1+prefetch_res_offset);
|
||||
internal::prefetch(r2+prefetch_res_offset);
|
||||
internal::prefetch(r3+prefetch_res_offset);
|
||||
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
||||
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
||||
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
||||
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
||||
|
||||
r0.prefetch(prefetch_res_offset);
|
||||
r1.prefetch(prefetch_res_offset);
|
||||
r2.prefetch(prefetch_res_offset);
|
||||
r3.prefetch(prefetch_res_offset);
|
||||
|
||||
// performs "inner" products
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
||||
@ -1171,25 +1175,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
blA += 1*LhsProgress;
|
||||
}
|
||||
#undef EIGEN_GEBGP_ONESTEP
|
||||
|
||||
|
||||
ResPacket R0, R1;
|
||||
ResPacket alphav = pset1<ResPacket>(alpha);
|
||||
|
||||
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
|
||||
|
||||
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
|
||||
traits.acc(C0, alphav, R0);
|
||||
traits.acc(C1, alphav, R1);
|
||||
pstoreu(r0+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r1+0*Traits::ResPacketSize, R1);
|
||||
|
||||
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
|
||||
R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
|
||||
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r1.storePacket(0 * Traits::ResPacketSize, R1);
|
||||
|
||||
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
||||
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
|
||||
traits.acc(C2, alphav, R0);
|
||||
traits.acc(C3, alphav, R1);
|
||||
pstoreu(r2+0*Traits::ResPacketSize, R0);
|
||||
pstoreu(r3+0*Traits::ResPacketSize, R1);
|
||||
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
r3.storePacket(0 * Traits::ResPacketSize, R1);
|
||||
}
|
||||
|
||||
|
||||
// Deal with remaining columns of the rhs
|
||||
for(Index j2=packet_cols4; j2<cols; j2++)
|
||||
{
|
||||
@ -1201,7 +1205,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
AccPacket C0;
|
||||
traits.initAcc(C0);
|
||||
|
||||
ResScalar* r0 = &res[(j2+0)*resStride + i];
|
||||
LinearMapper r0 = res.getLinearMapper(i, j2);
|
||||
|
||||
// performs "inner" products
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
||||
@ -1241,9 +1245,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
#undef EIGEN_GEBGP_ONESTEP
|
||||
ResPacket R0;
|
||||
ResPacket alphav = pset1<ResPacket>(alpha);
|
||||
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
|
||||
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
||||
traits.acc(C0, alphav, R0);
|
||||
pstoreu(r0+0*Traits::ResPacketSize, R0);
|
||||
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1259,7 +1263,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
||||
prefetch(&blA[0]);
|
||||
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
||||
|
||||
|
||||
if( (SwappedTraits::LhsProgress % 4)==0 )
|
||||
{
|
||||
// NOTE The following piece of code wont work for 512 bit registers
|
||||
@ -1268,32 +1272,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
straits.initAcc(C1);
|
||||
straits.initAcc(C2);
|
||||
straits.initAcc(C3);
|
||||
|
||||
|
||||
const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
|
||||
const Index endk = (depth/spk)*spk;
|
||||
const Index endk4 = (depth/(spk*4))*(spk*4);
|
||||
|
||||
|
||||
Index k=0;
|
||||
for(; k<endk4; k+=4*spk)
|
||||
{
|
||||
SLhsPacket A0,A1;
|
||||
SRhsPacket B_0,B_1;
|
||||
|
||||
|
||||
straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
|
||||
straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
|
||||
|
||||
|
||||
straits.loadRhsQuad(blA+0*spk, B_0);
|
||||
straits.loadRhsQuad(blA+1*spk, B_1);
|
||||
straits.madd(A0,B_0,C0,B_0);
|
||||
straits.madd(A1,B_1,C1,B_1);
|
||||
|
||||
|
||||
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
|
||||
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
|
||||
straits.loadRhsQuad(blA+2*spk, B_0);
|
||||
straits.loadRhsQuad(blA+3*spk, B_1);
|
||||
straits.madd(A0,B_0,C2,B_0);
|
||||
straits.madd(A1,B_1,C3,B_1);
|
||||
|
||||
|
||||
blB += 4*SwappedTraits::LhsProgress;
|
||||
blA += 4*spk;
|
||||
}
|
||||
@ -1302,11 +1306,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
{
|
||||
SLhsPacket A0;
|
||||
SRhsPacket B_0;
|
||||
|
||||
|
||||
straits.loadLhsUnaligned(blB, A0);
|
||||
straits.loadRhsQuad(blA, B_0);
|
||||
straits.madd(A0,B_0,C0,B_0);
|
||||
|
||||
|
||||
blB += SwappedTraits::LhsProgress;
|
||||
blA += spk;
|
||||
}
|
||||
@ -1317,10 +1321,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
|
||||
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
|
||||
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
|
||||
|
||||
SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride);
|
||||
|
||||
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
|
||||
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
|
||||
|
||||
|
||||
if(depth-endk>0)
|
||||
{
|
||||
// We have to handle the last row of the rhs which corresponds to a half-packet
|
||||
@ -1336,14 +1340,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
{
|
||||
straits.acc(predux4(C0), alphav, R);
|
||||
}
|
||||
pscatter(&res[j2*resStride + i], R, resStride);
|
||||
res.scatterPacket(i, j2, R);
|
||||
}
|
||||
else
|
||||
{
|
||||
SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
|
||||
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
|
||||
SResPacket alphav = pset1<SResPacket>(alpha);
|
||||
straits.acc(C0, alphav, R);
|
||||
pscatter(&res[j2*resStride + i], R, resStride);
|
||||
res.scatterPacket(i, j2, R);
|
||||
}
|
||||
}
|
||||
else // scalar path
|
||||
@ -1355,25 +1359,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
{
|
||||
LhsScalar A0;
|
||||
RhsScalar B_0, B_1;
|
||||
|
||||
|
||||
A0 = blA[k];
|
||||
|
||||
|
||||
B_0 = blB[0];
|
||||
B_1 = blB[1];
|
||||
MADD(cj,A0,B_0,C0, B_0);
|
||||
MADD(cj,A0,B_1,C1, B_1);
|
||||
|
||||
|
||||
B_0 = blB[2];
|
||||
B_1 = blB[3];
|
||||
MADD(cj,A0,B_0,C2, B_0);
|
||||
MADD(cj,A0,B_1,C3, B_1);
|
||||
|
||||
|
||||
blB += 4;
|
||||
}
|
||||
res[(j2+0)*resStride + i] += alpha*C0;
|
||||
res[(j2+1)*resStride + i] += alpha*C1;
|
||||
res[(j2+2)*resStride + i] += alpha*C2;
|
||||
res[(j2+3)*resStride + i] += alpha*C3;
|
||||
res(i, j2 + 0) += alpha * C0;
|
||||
res(i, j2 + 1) += alpha * C1;
|
||||
res(i, j2 + 2) += alpha * C2;
|
||||
res(i, j2 + 3) += alpha * C3;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1394,7 +1398,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
RhsScalar B_0 = blB[k];
|
||||
MADD(cj, A0, B_0, C0, B_0);
|
||||
}
|
||||
res[(j2+0)*resStride + i] += alpha*C0;
|
||||
res(i, j2) += alpha * C0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1417,15 +1421,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
|
||||
//
|
||||
// 32 33 34 35 ...
|
||||
// 36 36 38 39 ...
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
||||
{
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
|
||||
typedef typename DataMapper::LinearMapper LinearMapper;
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
|
||||
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
||||
{
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
enum { PacketSize = packet_traits<Scalar>::size };
|
||||
@ -1436,30 +1441,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
|
||||
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
||||
eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
|
||||
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
||||
const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride);
|
||||
Index count = 0;
|
||||
|
||||
|
||||
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
||||
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
||||
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
||||
const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
|
||||
: Pack2>1 ? (rows/Pack2)*Pack2 : 0;
|
||||
|
||||
|
||||
Index i=0;
|
||||
|
||||
|
||||
// Pack 3 packets
|
||||
if(Pack1>=3*PacketSize)
|
||||
{
|
||||
for(; i<peeled_mc3; i+=3*PacketSize)
|
||||
{
|
||||
if(PanelMode) count += (3*PacketSize) * offset;
|
||||
|
||||
|
||||
for(Index k=0; k<depth; k++)
|
||||
{
|
||||
Packet A, B, C;
|
||||
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
|
||||
B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
|
||||
C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
|
||||
A = lhs.loadPacket(i+0*PacketSize, k);
|
||||
B = lhs.loadPacket(i+1*PacketSize, k);
|
||||
C = lhs.loadPacket(i+2*PacketSize, k);
|
||||
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
||||
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
||||
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
|
||||
@ -1473,12 +1477,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
|
||||
for(; i<peeled_mc2; i+=2*PacketSize)
|
||||
{
|
||||
if(PanelMode) count += (2*PacketSize) * offset;
|
||||
|
||||
|
||||
for(Index k=0; k<depth; k++)
|
||||
{
|
||||
Packet A, B;
|
||||
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
|
||||
B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
|
||||
A = lhs.loadPacket(i+0*PacketSize, k);
|
||||
B = lhs.loadPacket(i+1*PacketSize, k);
|
||||
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
||||
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
||||
}
|
||||
@ -1491,11 +1495,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
|
||||
for(; i<peeled_mc1; i+=1*PacketSize)
|
||||
{
|
||||
if(PanelMode) count += (1*PacketSize) * offset;
|
||||
|
||||
|
||||
for(Index k=0; k<depth; k++)
|
||||
{
|
||||
Packet A;
|
||||
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
|
||||
A = lhs.loadPacket(i+0*PacketSize, k);
|
||||
pstore(blockA+count, cj.pconj(A));
|
||||
count+=PacketSize;
|
||||
}
|
||||
@ -1508,11 +1512,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
|
||||
for(; i<peeled_mc0; i+=Pack2)
|
||||
{
|
||||
if(PanelMode) count += Pack2 * offset;
|
||||
|
||||
|
||||
for(Index k=0; k<depth; k++)
|
||||
for(Index w=0; w<Pack2; w++)
|
||||
blockA[count++] = cj(lhs(i+w, k));
|
||||
|
||||
|
||||
if(PanelMode) count += Pack2 * (stride-offset-depth);
|
||||
}
|
||||
}
|
||||
@ -1525,15 +1529,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
||||
{
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
|
||||
typedef typename DataMapper::LinearMapper LinearMapper;
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
|
||||
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
||||
{
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
enum { PacketSize = packet_traits<Scalar>::size };
|
||||
@ -1543,13 +1548,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
|
||||
EIGEN_UNUSED_VARIABLE(offset);
|
||||
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
||||
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
||||
const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride);
|
||||
Index count = 0;
|
||||
|
||||
|
||||
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
||||
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
||||
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
||||
|
||||
|
||||
int pack = Pack1;
|
||||
Index i = 0;
|
||||
while(pack>0)
|
||||
@ -1569,7 +1573,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
|
||||
for (Index m = 0; m < pack; m += PacketSize)
|
||||
{
|
||||
PacketBlock<Packet> kernel;
|
||||
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
|
||||
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
|
||||
ptranspose(kernel);
|
||||
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
|
||||
}
|
||||
@ -1594,15 +1598,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
|
||||
for(;w<pack;++w)
|
||||
blockA[count++] = cj(lhs(i+w, k));
|
||||
}
|
||||
|
||||
|
||||
if(PanelMode) count += pack * (stride-offset-depth);
|
||||
}
|
||||
|
||||
|
||||
pack -= PacketSize;
|
||||
if(pack<Pack2 && (pack+PacketSize)!=Pack2)
|
||||
pack = Pack2;
|
||||
}
|
||||
|
||||
|
||||
for(; i<rows; i++)
|
||||
{
|
||||
if(PanelMode) count += offset;
|
||||
@ -1619,17 +1623,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
|
||||
// 4 5 6 7 16 17 18 19 25 28
|
||||
// 8 9 10 11 20 21 22 23 26 29
|
||||
// . . . . . . . . . .
|
||||
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
||||
{
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename DataMapper::LinearMapper LinearMapper;
|
||||
enum { PacketSize = packet_traits<Scalar>::size };
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
|
||||
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
||||
{
|
||||
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
|
||||
EIGEN_UNUSED_VARIABLE(stride);
|
||||
@ -1685,27 +1690,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
||||
// if(PanelMode) count += 8 * (stride-offset-depth);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
if(nr>=4)
|
||||
{
|
||||
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
|
||||
{
|
||||
// skip what we have before
|
||||
if(PanelMode) count += 4 * offset;
|
||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
||||
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
|
||||
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
|
||||
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
|
||||
|
||||
const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
|
||||
const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
|
||||
const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
|
||||
const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
|
||||
|
||||
Index k=0;
|
||||
if((PacketSize%4)==0) // TODO enbale vectorized transposition for PacketSize==2 ??
|
||||
{
|
||||
for(; k<peeled_k; k+=PacketSize) {
|
||||
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
|
||||
kernel.packet[0] = ploadu<Packet>(&b0[k]);
|
||||
kernel.packet[1] = ploadu<Packet>(&b1[k]);
|
||||
kernel.packet[2] = ploadu<Packet>(&b2[k]);
|
||||
kernel.packet[3] = ploadu<Packet>(&b3[k]);
|
||||
kernel.packet[0] = dm0.loadPacket(k);
|
||||
kernel.packet[1] = dm1.loadPacket(k);
|
||||
kernel.packet[2] = dm2.loadPacket(k);
|
||||
kernel.packet[3] = dm3.loadPacket(k);
|
||||
ptranspose(kernel);
|
||||
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
|
||||
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
|
||||
@ -1716,10 +1721,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
||||
}
|
||||
for(; k<depth; k++)
|
||||
{
|
||||
blockB[count+0] = cj(b0[k]);
|
||||
blockB[count+1] = cj(b1[k]);
|
||||
blockB[count+2] = cj(b2[k]);
|
||||
blockB[count+3] = cj(b3[k]);
|
||||
blockB[count+0] = cj(dm0(k));
|
||||
blockB[count+1] = cj(dm1(k));
|
||||
blockB[count+2] = cj(dm2(k));
|
||||
blockB[count+3] = cj(dm3(k));
|
||||
count += 4;
|
||||
}
|
||||
// skip what we have after
|
||||
@ -1731,10 +1736,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
||||
for(Index j2=packet_cols4; j2<cols; ++j2)
|
||||
{
|
||||
if(PanelMode) count += offset;
|
||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
||||
const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
|
||||
for(Index k=0; k<depth; k++)
|
||||
{
|
||||
blockB[count] = cj(b0[k]);
|
||||
blockB[count] = cj(dm0(k));
|
||||
count += 1;
|
||||
}
|
||||
if(PanelMode) count += (stride-offset-depth);
|
||||
@ -1742,17 +1747,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
||||
}
|
||||
|
||||
// this version is optimized for row major matrices
|
||||
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
||||
{
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename DataMapper::LinearMapper LinearMapper;
|
||||
enum { PacketSize = packet_traits<Scalar>::size };
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
|
||||
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
||||
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
||||
{
|
||||
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
||||
EIGEN_UNUSED_VARIABLE(stride);
|
||||
@ -1762,7 +1768,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
|
||||
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
||||
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
||||
Index count = 0;
|
||||
|
||||
|
||||
// if(nr>=8)
|
||||
// {
|
||||
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
||||
@ -1805,15 +1811,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
|
||||
for(Index k=0; k<depth; k++)
|
||||
{
|
||||
if (PacketSize==4) {
|
||||
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
||||
Packet A = rhs.loadPacket(k, j2);
|
||||
pstoreu(blockB+count, cj.pconj(A));
|
||||
count += PacketSize;
|
||||
} else {
|
||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||
blockB[count+0] = cj(b0[0]);
|
||||
blockB[count+1] = cj(b0[1]);
|
||||
blockB[count+2] = cj(b0[2]);
|
||||
blockB[count+3] = cj(b0[3]);
|
||||
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
|
||||
blockB[count+0] = cj(dm0(0));
|
||||
blockB[count+1] = cj(dm0(1));
|
||||
blockB[count+2] = cj(dm0(2));
|
||||
blockB[count+3] = cj(dm0(3));
|
||||
count += 4;
|
||||
}
|
||||
}
|
||||
@ -1825,10 +1831,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
|
||||
for(Index j2=packet_cols4; j2<cols; ++j2)
|
||||
{
|
||||
if(PanelMode) count += offset;
|
||||
const Scalar* b0 = &rhs[j2];
|
||||
for(Index k=0; k<depth; k++)
|
||||
{
|
||||
blockB[count] = cj(b0[k*rhsStride]);
|
||||
blockB[count] = cj(rhs(k, j2));
|
||||
count += 1;
|
||||
}
|
||||
if(PanelMode) count += stride-offset-depth;
|
||||
|
@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
|
||||
static void run(Index rows, Index cols, Index depth,
|
||||
const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride,
|
||||
ResScalar* res, Index resStride,
|
||||
ResScalar* _res, Index resStride,
|
||||
ResScalar alpha,
|
||||
level3_blocking<LhsScalar,RhsScalar>& blocking,
|
||||
GemmParallelInfo<Index>* info = 0)
|
||||
{
|
||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
|
||||
#ifdef EIGEN_HAS_OPENMP
|
||||
if(info)
|
||||
@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
|
||||
// In order to reduce the chance that a thread has to wait for the other,
|
||||
// let's start by packing B'.
|
||||
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
|
||||
|
||||
// Pack A_k to A' in a parallel fashion:
|
||||
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
|
||||
@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
|
||||
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
|
||||
while(info[tid].users!=0) {}
|
||||
info[tid].users += threads;
|
||||
|
||||
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
|
||||
|
||||
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
|
||||
|
||||
// Notify the other threads that the part A'_i is ready to go.
|
||||
info[tid].sync = k;
|
||||
@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
|
||||
// At this point we have to make sure that A'_i has been updated by the thread i,
|
||||
// we use testAndSetOrdered to mimic a volatile access.
|
||||
// However, no need to wait for the B' part which has been updated by the current thread!
|
||||
if(shift>0)
|
||||
while(info[i].sync!=k) {}
|
||||
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
|
||||
if (shift>0) {
|
||||
while(info[i].sync!=k) {
|
||||
}
|
||||
}
|
||||
|
||||
gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
|
||||
}
|
||||
|
||||
// Then keep going as usual with the remaining B'
|
||||
@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
|
||||
const Index actual_nc = (std::min)(j+nc,cols)-j;
|
||||
|
||||
// pack B_k,j to B'
|
||||
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
|
||||
|
||||
// C_j += A' * B'
|
||||
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
}
|
||||
|
||||
// Release all the sub blocks A'_i of A' for the current thread,
|
||||
@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
for(Index i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
|
||||
|
||||
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
||||
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
||||
// horizontal panel which is, in practice, a very low number.
|
||||
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
|
||||
|
||||
// For each kc x nc block of the rhs's horizontal panel...
|
||||
for(Index j2=0; j2<cols; j2+=nc)
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
{
|
||||
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
||||
|
||||
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
||||
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
|
||||
|
||||
// Everything is packed, we can now call the panel * block kernel:
|
||||
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
|
||||
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
||||
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
||||
// horizontal panel which is, in practice, a very low number.
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
|
||||
|
||||
// For each kc x nc block of the rhs's horizontal panel...
|
||||
for(Index j2=0; j2<cols; j2+=nc)
|
||||
{
|
||||
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
||||
|
||||
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
|
||||
|
||||
// Everything is packed, we can now call the panel * block kernel:
|
||||
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -335,7 +347,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
||||
DenseIndex n = this->m_nc;
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
|
||||
}
|
||||
|
||||
|
||||
m_sizeA = this->m_mc * this->m_kc;
|
||||
m_sizeB = this->m_kc * this->m_nc;
|
||||
}
|
||||
|
@ -58,13 +58,17 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
{
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
|
||||
{
|
||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = depth; // cache block size along the K direction
|
||||
Index mc = size; // cache block size along the M direction
|
||||
Index nc = size; // cache block size along the N direction
|
||||
@ -75,10 +79,10 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
|
||||
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
|
||||
// note that the actual rhs is the transpose/adjoint of mat
|
||||
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
|
||||
|
||||
for(Index i2=0; i2<size; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,size)-i2;
|
||||
|
||||
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
// the selected actual_mc * size panel of res is split into three different part:
|
||||
// 1 - before the diagonal => processed with gebp or skipped
|
||||
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
|
||||
// 3 - after the diagonal => processed with gebp or skipped
|
||||
if (UpLo==Lower)
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
|
||||
(std::min)(size,i2), alpha, -1, -1, 0, 0);
|
||||
|
||||
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
|
||||
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
|
||||
if (UpLo==Upper)
|
||||
{
|
||||
Index j2 = i2+actual_mc;
|
||||
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
|
||||
actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -129,13 +134,16 @@ struct tribb_kernel
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
|
||||
typedef typename Traits::ResScalar ResScalar;
|
||||
|
||||
|
||||
enum {
|
||||
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
|
||||
};
|
||||
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
{
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
|
||||
ResMapper res(_res, resStride);
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
|
||||
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
|
||||
|
||||
// let's process the block per panel of actual_mc x BlockSize,
|
||||
@ -146,7 +154,7 @@ struct tribb_kernel
|
||||
const RhsScalar* actual_b = blockB+j*depth;
|
||||
|
||||
if(UpLo==Upper)
|
||||
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
|
||||
// selfadjoint micro block
|
||||
@ -154,12 +162,12 @@ struct tribb_kernel
|
||||
Index i = j;
|
||||
buffer.setZero();
|
||||
// 1 - apply the kernel on the temporary buffer
|
||||
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
// 2 - triangular accumulation
|
||||
for(Index j1=0; j1<actualBlockSize; ++j1)
|
||||
{
|
||||
ResScalar* r = res + (j+j1)*resStride + i;
|
||||
ResScalar* r = &res(i, j + j1);
|
||||
for(Index i1=UpLo==Lower ? j1 : 0;
|
||||
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
|
||||
r[i1] += buffer(i1,j1);
|
||||
@ -169,8 +177,8 @@ struct tribb_kernel
|
||||
if(UpLo==Lower)
|
||||
{
|
||||
Index i = j+actualBlockSize;
|
||||
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
|
||||
depth, actualBlockSize, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -324,16 +324,22 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
{
|
||||
Index size = rows;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB;
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
||||
|
||||
for(Index k2=0; k2<size; k2+=kc)
|
||||
{
|
||||
@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
// we have selected one row panel of rhs and one column panel of lhs
|
||||
// pack rhs's panel into a sequential chunk of memory
|
||||
// and expand each coeff to a constant packet for further reuse
|
||||
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
|
||||
|
||||
// the select lhs's panel has to be split in three different parts:
|
||||
// 1 - the transposed panel above the diagonal block => transposed packed copy
|
||||
@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
|
||||
// transposed packed copy
|
||||
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(k2, i2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
// the block diagonal
|
||||
{
|
||||
@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
// symmetric packed copy
|
||||
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
|
||||
for(Index i2=k2+kc; i2<size; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,size)-i2;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -414,15 +420,18 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
{
|
||||
Index size = cols;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
ResMapper res(_res,resStride);
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
@ -432,8 +441,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB;
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
|
||||
for(Index k2=0; k2<size; k2+=kc)
|
||||
@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
for(Index i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
|
||||
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
// strip zeros
|
||||
@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
Index depth = IsLower ? diagSize : _depth;
|
||||
Index cols = _cols;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
else
|
||||
triangularBuffer.diagonal().setOnes();
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
|
||||
for(Index k2=IsLower ? depth : 0;
|
||||
IsLower ? k2>0 : k2<depth;
|
||||
@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
k2 = k2+actual_kc-kc;
|
||||
}
|
||||
|
||||
pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
|
||||
pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
|
||||
|
||||
// the selected lhs's panel has to be split in three different parts:
|
||||
// 1 - the part which is zero => skip it
|
||||
@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
|
||||
triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
|
||||
}
|
||||
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
|
||||
pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
|
||||
|
||||
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
|
||||
gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
|
||||
actualPanelWidth, actualPanelWidth, cols, alpha,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
|
||||
// GEBP with remaining micro panel
|
||||
@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
{
|
||||
Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
|
||||
|
||||
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
|
||||
pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
|
||||
gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
|
||||
lengthTarget, actualPanelWidth, cols, alpha,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
for(Index i2=start; i2<end; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,end)-i2;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
|
||||
actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
// strip zeros
|
||||
@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index depth = IsLower ? _depth : diagSize;
|
||||
Index cols = IsLower ? diagSize : _cols;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
else
|
||||
triangularBuffer.diagonal().setOnes();
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
|
||||
for(Index k2=IsLower ? 0 : depth;
|
||||
IsLower ? k2<depth : k2>0;
|
||||
@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Scalar* geb = blockB+ts*ts;
|
||||
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
|
||||
|
||||
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
|
||||
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
|
||||
|
||||
// pack the triangular part of the rhs padding the unrolled blocks with zeros
|
||||
if(ts>0)
|
||||
@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
|
||||
// general part
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
&rhs(actual_k2+panelOffset, actual_j2), rhsStride,
|
||||
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
|
||||
panelLength, actualPanelWidth,
|
||||
actual_kc, panelOffset);
|
||||
|
||||
@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
}
|
||||
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
triangularBuffer.data(), triangularBuffer.outerStride(),
|
||||
RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
|
||||
actualPanelWidth, actualPanelWidth,
|
||||
actual_kc, j2);
|
||||
}
|
||||
@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
for (Index i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(mc,rows-i2);
|
||||
pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
|
||||
|
||||
// triangular kernel
|
||||
if(ts>0)
|
||||
@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
|
||||
Index blockOffset = IsLower ? j2 : 0;
|
||||
|
||||
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
|
||||
gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
|
||||
blockA, blockB+j2*actual_kc,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
alpha,
|
||||
@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
blockOffset, blockOffset);// offsets
|
||||
}
|
||||
}
|
||||
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
|
||||
gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
|
||||
blockA, geb, actual_mc, actual_kc, rs,
|
||||
alpha,
|
||||
-1, -1, 0, 0);
|
||||
|
@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index cols = otherSize;
|
||||
const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
|
||||
blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
|
||||
TriMapper tri(_tri, triStride);
|
||||
OtherMapper other(_other, otherStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
enum {
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
|
||||
IsLower = (Mode&Lower) == Lower
|
||||
@ -71,9 +75,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
|
||||
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
|
||||
|
||||
// the goal here is to subdivise the Rhs panels such that we keep some cache
|
||||
// coherence when accessing the rhs elements
|
||||
@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
Index blockBOffset = IsLower ? k1 : lengthTarget;
|
||||
|
||||
// update the respective rows of B from other
|
||||
pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
|
||||
pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
|
||||
|
||||
// GEBP
|
||||
if (lengthTarget>0)
|
||||
{
|
||||
Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
|
||||
|
||||
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
|
||||
pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
|
||||
gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
const Index actual_mc = (std::min)(mc,end-i2);
|
||||
if (actual_mc>0)
|
||||
{
|
||||
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
|
||||
gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index rows = otherSize;
|
||||
const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
|
||||
blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
|
||||
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
|
||||
LhsMapper lhs(_other, otherStride);
|
||||
RhsMapper rhs(_tri, triStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
|
||||
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
|
||||
|
||||
for(Index k2=IsLower ? size : 0;
|
||||
IsLower ? k2>0 : k2<size;
|
||||
@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
|
||||
Scalar* geb = blockB+actual_kc*actual_kc;
|
||||
|
||||
if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
|
||||
if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
|
||||
|
||||
// triangular packing (we only pack the panels off the diagonal,
|
||||
// neglecting the blocks overlapping the diagonal
|
||||
@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
|
||||
if (panelLength>0)
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
&rhs(actual_k2+panelOffset, actual_j2), triStride,
|
||||
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
|
||||
panelLength, actualPanelWidth,
|
||||
actual_kc, panelOffset);
|
||||
}
|
||||
@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
// GEBP
|
||||
if(panelLength>0)
|
||||
{
|
||||
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
|
||||
gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
|
||||
blockA, blockB+j2*actual_kc,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
Scalar(-1),
|
||||
@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
}
|
||||
|
||||
// pack the just computed part of lhs to A
|
||||
pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
|
||||
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
|
||||
actualPanelWidth, actual_mc,
|
||||
actual_kc, j2);
|
||||
}
|
||||
}
|
||||
|
||||
if (rs>0)
|
||||
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
|
||||
gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
|
||||
actual_mc, actual_kc, rs, Scalar(-1),
|
||||
-1, -1, 0, 0);
|
||||
}
|
||||
|
@ -18,13 +18,13 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// forward declarations
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
|
||||
struct gebp_kernel;
|
||||
|
||||
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
|
||||
struct gemm_pack_rhs;
|
||||
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
|
||||
struct gemm_pack_lhs;
|
||||
|
||||
template<
|
||||
@ -117,32 +117,96 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
|
||||
static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
|
||||
};
|
||||
|
||||
// Lightweight helper class to access matrix coefficients.
|
||||
// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
|
||||
// and so I hope better compilation performance (time and code quality).
|
||||
template<typename Scalar, typename Index, int StorageOrder>
|
||||
class blas_data_mapper
|
||||
{
|
||||
|
||||
template<typename Scalar, typename Index, int AlignmentType>
|
||||
class MatrixLinearMapper {
|
||||
public:
|
||||
blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
|
||||
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
|
||||
internal::prefetch(&operator()(i));
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
|
||||
return m_data[i];
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
|
||||
return ploadt<Packet, AlignmentType>(m_data + i);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
|
||||
return ploadt<HalfPacket, AlignmentType>(m_data + i);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
|
||||
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
|
||||
}
|
||||
|
||||
protected:
|
||||
Scalar* EIGEN_RESTRICT m_data;
|
||||
Index m_stride;
|
||||
Scalar *m_data;
|
||||
};
|
||||
|
||||
// Lightweight helper class to access matrix coefficients.
|
||||
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
|
||||
class blas_data_mapper {
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
typedef MatrixLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
|
||||
|
||||
EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
|
||||
getSubMapper(Index i, Index j) const {
|
||||
return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
|
||||
return LinearMapper(&operator()(i, j));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
|
||||
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
|
||||
return ploadt<Packet, AlignmentType>(&operator()(i, j));
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
|
||||
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
|
||||
}
|
||||
|
||||
template<typename SubPacket>
|
||||
EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
|
||||
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
||||
}
|
||||
|
||||
template<typename SubPacket>
|
||||
EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
|
||||
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
|
||||
}
|
||||
|
||||
protected:
|
||||
Scalar* EIGEN_RESTRICT m_data;
|
||||
const Index m_stride;
|
||||
};
|
||||
|
||||
// lightweight helper class to access matrix coefficients (const version)
|
||||
template<typename Scalar, typename Index, int StorageOrder>
|
||||
class const_blas_data_mapper
|
||||
{
|
||||
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
|
||||
public:
|
||||
const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
|
||||
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
|
||||
protected:
|
||||
const Scalar* EIGEN_RESTRICT m_data;
|
||||
Index m_stride;
|
||||
EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
|
||||
return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ if(MPFR_FOUND)
|
||||
include_directories(${MPFR_INCLUDES} ./mpreal)
|
||||
ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
|
||||
set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
|
||||
ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
|
||||
# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
|
||||
else()
|
||||
ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
|
||||
endif()
|
||||
|
Loading…
x
Reference in New Issue
Block a user