Generalized the gebp apis

This commit is contained in:
Benoit Steiner 2014-10-02 16:51:57 -07:00
parent 8b2afe33a1
commit b7271dffb5
8 changed files with 474 additions and 358 deletions

View File

@ -667,7 +667,7 @@ protected:
* |real |cplx | no vectorization yet, would require to pack A with duplication
* |cplx |real | easy vectorization
*/
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@ -684,6 +684,7 @@ struct gebp_kernel
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;
typedef typename DataMapper::LinearMapper LinearMapper;
enum {
Vectorizable = Traits::Vectorizable,
@ -693,14 +694,16 @@ struct gebp_kernel
};
EIGEN_DONT_INLINE
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
Traits traits;
@ -743,15 +746,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = &res[(j2+1)*resStride + i];
ResScalar* r2 = &res[(j2+2)*resStride + i];
ResScalar* r3 = &res[(j2+3)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
internal::prefetch(r0);
internal::prefetch(r1);
internal::prefetch(r2);
internal::prefetch(r3);
r0.prefetch(0);
r1.prefetch(0);
r2.prefetch(0);
r3.prefetch(0);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@ -814,45 +817,45 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
pstoreu(r0+2*Traits::ResPacketSize, R2);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize);
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C5, alphav, R1);
traits.acc(C9, alphav, R2);
pstoreu(r1+0*Traits::ResPacketSize, R0);
pstoreu(r1+1*Traits::ResPacketSize, R1);
pstoreu(r1+2*Traits::ResPacketSize, R2);
r1.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(2 * Traits::ResPacketSize, R2);
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C10, alphav, R2);
pstoreu(r2+0*Traits::ResPacketSize, R0);
pstoreu(r2+1*Traits::ResPacketSize, R1);
pstoreu(r2+2*Traits::ResPacketSize, R2);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r2.storePacket(1 * Traits::ResPacketSize, R1);
r2.storePacket(2 * Traits::ResPacketSize, R2);
R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize);
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C7, alphav, R1);
traits.acc(C11, alphav, R2);
pstoreu(r3+0*Traits::ResPacketSize, R0);
pstoreu(r3+1*Traits::ResPacketSize, R1);
pstoreu(r3+2*Traits::ResPacketSize, R2);
r3.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(1 * Traits::ResPacketSize, R1);
r3.storePacket(2 * Traits::ResPacketSize, R2);
}
// Deal with remaining columns of the rhs
@ -868,7 +871,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C4);
traits.initAcc(C8);
ResScalar* r0 = &res[(j2+0)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2);
r0.prefetch(0);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@ -912,15 +916,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
pstoreu(r0+2*Traits::ResPacketSize, R2);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
}
}
}
@ -946,15 +950,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = &res[(j2+1)*resStride + i];
ResScalar* r2 = &res[(j2+2)*resStride + i];
ResScalar* r3 = &res[(j2+3)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
internal::prefetch(r0+prefetch_res_offset);
internal::prefetch(r1+prefetch_res_offset);
internal::prefetch(r2+prefetch_res_offset);
internal::prefetch(r3+prefetch_res_offset);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@ -1006,31 +1010,31 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C1, alphav, R2);
traits.acc(C5, alphav, R3);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
pstoreu(r1+0*Traits::ResPacketSize, R2);
pstoreu(r1+1*Traits::ResPacketSize, R3);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(0 * Traits::ResPacketSize, R2);
r1.storePacket(1 * Traits::ResPacketSize, R3);
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C3, alphav, R2);
traits.acc(C7, alphav, R3);
pstoreu(r2+0*Traits::ResPacketSize, R0);
pstoreu(r2+1*Traits::ResPacketSize, R1);
pstoreu(r3+0*Traits::ResPacketSize, R2);
pstoreu(r3+1*Traits::ResPacketSize, R3);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r2.storePacket(1 * Traits::ResPacketSize, R1);
r3.storePacket(0 * Traits::ResPacketSize, R2);
r3.storePacket(1 * Traits::ResPacketSize, R3);
}
// Deal with remaining columns of the rhs
@ -1045,8 +1049,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0);
traits.initAcc(C4);
ResScalar* r0 = &res[(j2+0)*resStride + i];
internal::prefetch(r0+prefetch_res_offset);
LinearMapper r0 = res.getLinearMapper(i, j2);
r0.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@ -1089,12 +1093,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
}
}
}
@ -1120,15 +1124,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C2);
traits.initAcc(C3);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = &res[(j2+1)*resStride + i];
ResScalar* r2 = &res[(j2+2)*resStride + i];
ResScalar* r3 = &res[(j2+3)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
internal::prefetch(r0+prefetch_res_offset);
internal::prefetch(r1+prefetch_res_offset);
internal::prefetch(r2+prefetch_res_offset);
internal::prefetch(r3+prefetch_res_offset);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@ -1175,19 +1179,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r1+0*Traits::ResPacketSize, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(0 * Traits::ResPacketSize, R1);
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
pstoreu(r2+0*Traits::ResPacketSize, R0);
pstoreu(r3+0*Traits::ResPacketSize, R1);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(0 * Traits::ResPacketSize, R1);
}
// Deal with remaining columns of the rhs
@ -1201,7 +1205,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
AccPacket C0;
traits.initAcc(C0);
ResScalar* r0 = &res[(j2+0)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@ -1241,9 +1245,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
pstoreu(r0+0*Traits::ResPacketSize, R0);
r0.storePacket(0 * Traits::ResPacketSize, R0);
}
}
}
@ -1318,7 +1322,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride);
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
if(depth-endk>0)
@ -1336,14 +1340,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
straits.acc(predux4(C0), alphav, R);
}
pscatter(&res[j2*resStride + i], R, resStride);
res.scatterPacket(i, j2, R);
}
else
{
SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
SResPacket alphav = pset1<SResPacket>(alpha);
straits.acc(C0, alphav, R);
pscatter(&res[j2*resStride + i], R, resStride);
res.scatterPacket(i, j2, R);
}
}
else // scalar path
@ -1370,10 +1374,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += 4;
}
res[(j2+0)*resStride + i] += alpha*C0;
res[(j2+1)*resStride + i] += alpha*C1;
res[(j2+2)*resStride + i] += alpha*C2;
res[(j2+3)*resStride + i] += alpha*C3;
res(i, j2 + 0) += alpha * C0;
res(i, j2 + 1) += alpha * C1;
res(i, j2 + 2) += alpha * C2;
res(i, j2 + 3) += alpha * C3;
}
}
}
@ -1394,7 +1398,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
RhsScalar B_0 = blB[k];
MADD(cj, A0, B_0, C0, B_0);
}
res[(j2+0)*resStride + i] += alpha*C0;
res(i, j2) += alpha * C0;
}
}
}
@ -1417,15 +1421,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
//
// 32 33 34 35 ...
// 36 36 38 39 ...
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
@ -1436,7 +1441,6 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride);
Index count = 0;
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
@ -1457,9 +1461,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(Index k=0; k<depth; k++)
{
Packet A, B, C;
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
A = lhs.loadPacket(i+0*PacketSize, k);
B = lhs.loadPacket(i+1*PacketSize, k);
C = lhs.loadPacket(i+2*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@ -1477,8 +1481,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(Index k=0; k<depth; k++)
{
Packet A, B;
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
A = lhs.loadPacket(i+0*PacketSize, k);
B = lhs.loadPacket(i+1*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
}
@ -1495,7 +1499,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(Index k=0; k<depth; k++)
{
Packet A;
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
A = lhs.loadPacket(i+0*PacketSize, k);
pstore(blockA+count, cj.pconj(A));
count+=PacketSize;
}
@ -1525,15 +1529,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
}
}
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
@ -1543,7 +1548,6 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
EIGEN_UNUSED_VARIABLE(offset);
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride);
Index count = 0;
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
@ -1569,7 +1573,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
for (Index m = 0; m < pack; m += PacketSize)
{
PacketBlock<Packet> kernel;
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
}
@ -1619,17 +1623,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@ -1692,20 +1697,20 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
{
// skip what we have before
if(PanelMode) count += 4 * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
Index k=0;
if((PacketSize%4)==0) // TODO enbale vectorized transposition for PacketSize==2 ??
{
for(; k<peeled_k; k+=PacketSize) {
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
kernel.packet[0] = ploadu<Packet>(&b0[k]);
kernel.packet[1] = ploadu<Packet>(&b1[k]);
kernel.packet[2] = ploadu<Packet>(&b2[k]);
kernel.packet[3] = ploadu<Packet>(&b3[k]);
kernel.packet[0] = dm0.loadPacket(k);
kernel.packet[1] = dm1.loadPacket(k);
kernel.packet[2] = dm2.loadPacket(k);
kernel.packet[3] = dm3.loadPacket(k);
ptranspose(kernel);
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
@ -1716,10 +1721,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
for(; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
blockB[count+2] = cj(b2[k]);
blockB[count+3] = cj(b3[k]);
blockB[count+0] = cj(dm0(k));
blockB[count+1] = cj(dm1(k));
blockB[count+2] = cj(dm2(k));
blockB[count+3] = cj(dm3(k));
count += 4;
}
// skip what we have after
@ -1731,10 +1736,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
for(Index j2=packet_cols4; j2<cols; ++j2)
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
for(Index k=0; k<depth; k++)
{
blockB[count] = cj(b0[k]);
blockB[count] = cj(dm0(k));
count += 1;
}
if(PanelMode) count += (stride-offset-depth);
@ -1742,17 +1747,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@ -1805,15 +1811,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
for(Index k=0; k<depth; k++)
{
if (PacketSize==4) {
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
Packet A = rhs.loadPacket(k, j2);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
} else {
const Scalar* b0 = &rhs[k*rhsStride + j2];
blockB[count+0] = cj(b0[0]);
blockB[count+1] = cj(b0[1]);
blockB[count+2] = cj(b0[2]);
blockB[count+3] = cj(b0[3]);
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
blockB[count+0] = cj(dm0(0));
blockB[count+1] = cj(dm0(1));
blockB[count+2] = cj(dm0(2));
blockB[count+3] = cj(dm0(3));
count += 4;
}
}
@ -1825,10 +1831,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
for(Index j2=packet_cols4; j2<cols; ++j2)
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[j2];
for(Index k=0; k<depth; k++)
{
blockB[count] = cj(b0[k*rhsStride]);
blockB[count] = cj(rhs(k, j2));
count += 1;
}
if(PanelMode) count += stride-offset-depth;

View File

@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar* _res, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing B'.
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
// Pack A_k to A' in a parallel fashion:
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
@ -106,7 +110,7 @@ static void run(Index rows, Index cols, Index depth,
while(info[tid].users!=0) {}
info[tid].users += threads;
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
// Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k;
@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
// At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if(shift>0)
while(info[i].sync!=k) {}
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
if (shift>0) {
while(info[i].sync!=k) {
}
}
gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
}
// Then keep going as usual with the remaining B'
@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
const Index actual_nc = (std::min)(j+nc,cols)-j;
// pack B_k,j to B'
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
// C_j += A' * B'
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
}
// Release all the sub blocks A'_i of A' for the current thread,
@ -159,6 +166,10 @@ static void run(Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
@ -167,7 +178,7 @@ static void run(Index rows, Index cols, Index depth,
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
@ -177,10 +188,11 @@ static void run(Index rows, Index cols, Index depth,
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
}
}
}
}

View File

@ -58,13 +58,17 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
@ -76,9 +80,9 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// note that the actual rhs is the transpose/adjoint of mat
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
for(Index i2=0; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
// the selected actual_mc * size panel of res is split into three different part:
// 1 - before the diagonal => processed with gebp or skipped
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0);
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0);
gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
}
}
}
@ -133,9 +138,12 @@ struct tribb_kernel
enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
// let's process the block per panel of actual_mc x BlockSize,
@ -146,7 +154,7 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// selfadjoint micro block
@ -154,12 +162,12 @@ struct tribb_kernel
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
ResScalar* r = res + (j+j1)*resStride + i;
ResScalar* r = &res(i, j + j1);
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
@ -169,8 +177,8 @@ struct tribb_kernel
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0);
}
}
}

View File

@ -324,16 +324,22 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = rows;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
// the select lhs's panel has to be split in three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
{
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
// transposed packed copy
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(k2, i2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
// the block diagonal
{
@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
@ -414,15 +420,18 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride);
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
@ -432,8 +441,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}

View File

@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index depth = IsLower ? diagSize : _depth;
Index cols = _cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
else
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth;
@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
k2 = k2+actual_kc-kc;
}
pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
// the selected lhs's panel has to be split in three different parts:
// 1 - the part which is zero => skip it
@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
}
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel
@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
{
Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,end)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
actual_kc, cols, alpha, -1, -1, 0, 0);
}
}
}
@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index depth = IsLower ? _depth : diagSize;
Index cols = IsLower ? diagSize : _cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
else
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0;
@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Scalar* geb = blockB+ts*ts;
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
// pack the triangular part of the rhs padding the unrolled blocks with zeros
if(ts>0)
@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
// general part
pack_rhs_panel(blockB+j2*actual_kc,
&rhs(actual_k2+panelOffset, actual_j2), rhsStride,
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
}
pack_rhs_panel(blockB+j2*actual_kc,
triangularBuffer.data(), triangularBuffer.outerStride(),
RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
actualPanelWidth, actualPanelWidth,
actual_kc, j2);
}
@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
for (Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(mc,rows-i2);
pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
// triangular kernel
if(ts>0)
@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
Index blockOffset = IsLower ? j2 : 0;
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
alpha,
@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
blockOffset, blockOffset);// offsets
}
}
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
blockA, geb, actual_mc, actual_kc, rs,
alpha,
-1, -1, 0, 0);

View File

@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
level3_blocking<Scalar,Scalar>& blocking)
{
Index cols = otherSize;
const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
TriMapper tri(_tri, triStride);
OtherMapper other(_other, otherStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
@ -71,9 +75,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
// the goal here is to subdivise the Rhs panels such that we keep some cache
// coherence when accessing the rhs elements
@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
Index blockBOffset = IsLower ? k1 : lengthTarget;
// update the respective rows of B from other
pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
// GEBP
if (lengthTarget>0)
{
Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
const Index actual_mc = (std::min)(mc,end-i2);
if (actual_mc>0)
{
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
}
}
}
@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
LhsMapper lhs(_other, otherStride);
RhsMapper rhs(_tri, triStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;
@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
Scalar* geb = blockB+actual_kc*actual_kc;
if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
// triangular packing (we only pack the panels off the diagonal,
// neglecting the blocks overlapping the diagonal
@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
if (panelLength>0)
pack_rhs_panel(blockB+j2*actual_kc,
&rhs(actual_k2+panelOffset, actual_j2), triStride,
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
}
@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
// GEBP
if(panelLength>0)
{
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
Scalar(-1),
@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
}
// pack the just computed part of lhs to A
pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
actualPanelWidth, actual_mc,
actual_kc, j2);
}
}
if (rs>0)
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0);
}

View File

@ -18,13 +18,13 @@ namespace Eigen {
namespace internal {
// forward declarations
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
struct gebp_kernel;
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs;
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs;
template<
@ -117,32 +117,96 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
};
// Lightweight helper class to access matrix coefficients.
// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
// and so I hope better compilation performance (time and code quality).
template<typename Scalar, typename Index, int StorageOrder>
class blas_data_mapper
{
template<typename Scalar, typename Index, int AlignmentType>
class MatrixLinearMapper {
public:
blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return ploadt<HalfPacket, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
}
protected:
Scalar *m_data;
};
// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
class blas_data_mapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef MatrixLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const {
return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
}
EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j));
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return ploadt<Packet, AlignmentType>(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}
protected:
Scalar* EIGEN_RESTRICT m_data;
Index m_stride;
const Index m_stride;
};
// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder>
class const_blas_data_mapper
{
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
public:
const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
protected:
const Scalar* EIGEN_RESTRICT m_data;
Index m_stride;
EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
}
};

View File

@ -48,7 +48,7 @@ if(MPFR_FOUND)
include_directories(${MPFR_INCLUDES} ./mpreal)
ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
else()
ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
endif()