mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 11:49:02 +08:00
merge
This commit is contained in:
commit
3beedba244
@ -27,6 +27,12 @@
|
|||||||
|
|
||||||
#ifndef EIGEN_EXTERN_INSTANTIATIONS
|
#ifndef EIGEN_EXTERN_INSTANTIATIONS
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_FUSE_CJMADD
|
||||||
|
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
|
||||||
|
#else
|
||||||
|
#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
|
||||||
|
#endif
|
||||||
|
|
||||||
// optimized GEneral packed Block * packed Panel product kernel
|
// optimized GEneral packed Block * packed Panel product kernel
|
||||||
template<typename Scalar, int mr, int nr, typename Conj>
|
template<typename Scalar, int mr, int nr, typename Conj>
|
||||||
struct ei_gebp_kernel
|
struct ei_gebp_kernel
|
||||||
@ -74,65 +80,111 @@ struct ei_gebp_kernel
|
|||||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||||
for(int k=0; k<peeled_kc; k+=4)
|
for(int k=0; k<peeled_kc; k+=4)
|
||||||
{
|
{
|
||||||
PacketType B0, B1, B2, B3, A0, A1;
|
if(nr==2)
|
||||||
|
{
|
||||||
|
PacketType B0, T0, A0, A1;
|
||||||
|
|
||||||
A0 = ei_pload(&blA[0*PacketSize]);
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
A1 = ei_pload(&blA[1*PacketSize]);
|
A1 = ei_pload(&blA[1*PacketSize]);
|
||||||
B0 = ei_pload(&blB[0*PacketSize]);
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
B1 = ei_pload(&blB[1*PacketSize]);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
CJMADD(A1,B0,C4,T0);
|
||||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
B0 = ei_pload(&blB[1*PacketSize]);
|
||||||
C4 = cj.pmadd(A1, B0, C4);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
CJMADD(A1,B0,C5,T0);
|
||||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
|
||||||
C5 = cj.pmadd(A1, B1, C5);
|
|
||||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
|
||||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
|
||||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
|
||||||
A0 = ei_pload(&blA[2*PacketSize]);
|
|
||||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
|
||||||
A1 = ei_pload(&blA[3*PacketSize]);
|
|
||||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
|
||||||
C4 = cj.pmadd(A1, B0, C4);
|
|
||||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
|
||||||
C5 = cj.pmadd(A1, B1, C5);
|
|
||||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
|
||||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
|
||||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
|
||||||
A0 = ei_pload(&blA[4*PacketSize]);
|
|
||||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
|
||||||
A1 = ei_pload(&blA[5*PacketSize]);
|
|
||||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
|
||||||
|
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
A0 = ei_pload(&blA[2*PacketSize]);
|
||||||
C4 = cj.pmadd(A1, B0, C4);
|
A1 = ei_pload(&blA[3*PacketSize]);
|
||||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
B0 = ei_pload(&blB[2*PacketSize]);
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C5 = cj.pmadd(A1, B1, C5);
|
CJMADD(A1,B0,C4,T0);
|
||||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
B0 = ei_pload(&blB[3*PacketSize]);
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
CJMADD(A1,B0,C5,T0);
|
||||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
A0 = ei_pload(&blA[4*PacketSize]);
|
||||||
A0 = ei_pload(&blA[6*PacketSize]);
|
A1 = ei_pload(&blA[5*PacketSize]);
|
||||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
B0 = ei_pload(&blB[4*PacketSize]);
|
||||||
A1 = ei_pload(&blA[7*PacketSize]);
|
CJMADD(A0,B0,C0,T0);
|
||||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
CJMADD(A1,B0,C4,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
B0 = ei_pload(&blB[5*PacketSize]);
|
||||||
C4 = cj.pmadd(A1, B0, C4);
|
CJMADD(A0,B0,C1,T0);
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
CJMADD(A1,B0,C5,T0);
|
||||||
C5 = cj.pmadd(A1, B1, C5);
|
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
A0 = ei_pload(&blA[6*PacketSize]);
|
||||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
A1 = ei_pload(&blA[7*PacketSize]);
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
B0 = ei_pload(&blB[6*PacketSize]);
|
||||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A1,B0,C4,T0);
|
||||||
|
B0 = ei_pload(&blB[7*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C1,T0);
|
||||||
|
CJMADD(A1,B0,C5,T0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
|
||||||
|
PacketType B0, B1, B2, B3, A0, A1;
|
||||||
|
PacketType T0, T1;
|
||||||
|
|
||||||
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
|
A1 = ei_pload(&blA[1*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
|
B1 = ei_pload(&blB[1*PacketSize]);
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||||
|
CJMADD(A1,B0,C4,T1);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T0);
|
||||||
|
CJMADD(A1,B1,C5,T1);
|
||||||
|
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||||
|
A0 = ei_pload(&blA[2*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||||
|
A1 = ei_pload(&blA[3*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A1,B0,C4,T1);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T0);
|
||||||
|
CJMADD(A1,B1,C5,T1);
|
||||||
|
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||||
|
A0 = ei_pload(&blA[4*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||||
|
A1 = ei_pload(&blA[5*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A1,B0,C4,T1);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T0);
|
||||||
|
CJMADD(A1,B1,C5,T1);
|
||||||
|
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||||
|
A0 = ei_pload(&blA[6*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||||
|
A1 = ei_pload(&blA[7*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A1,B0,C4,T1);
|
||||||
|
CJMADD(A0,B1,C1,T0);
|
||||||
|
CJMADD(A1,B1,C5,T1);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||||
|
}
|
||||||
|
|
||||||
blB += 4*nr*PacketSize;
|
blB += 4*nr*PacketSize;
|
||||||
blA += 4*mr;
|
blA += 4*mr;
|
||||||
@ -140,22 +192,40 @@ struct ei_gebp_kernel
|
|||||||
// process remaining peeled loop
|
// process remaining peeled loop
|
||||||
for(int k=peeled_kc; k<depth; k++)
|
for(int k=peeled_kc; k<depth; k++)
|
||||||
{
|
{
|
||||||
PacketType B0, B1, B2, B3, A0, A1;
|
if(nr==2)
|
||||||
|
{
|
||||||
|
PacketType B0, T0, A0, A1;
|
||||||
|
|
||||||
A0 = ei_pload(&blA[0*PacketSize]);
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
A1 = ei_pload(&blA[1*PacketSize]);
|
A1 = ei_pload(&blA[1*PacketSize]);
|
||||||
B0 = ei_pload(&blB[0*PacketSize]);
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
B1 = ei_pload(&blB[1*PacketSize]);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
CJMADD(A1,B0,C4,T0);
|
||||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
B0 = ei_pload(&blB[1*PacketSize]);
|
||||||
C4 = cj.pmadd(A1, B0, C4);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
CJMADD(A1,B0,C5,T0);
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
}
|
||||||
C5 = cj.pmadd(A1, B1, C5);
|
else
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
{
|
||||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
PacketType B0, B1, B2, B3, A0, A1, T0, T1;
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
|
||||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
|
A1 = ei_pload(&blA[1*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
|
B1 = ei_pload(&blB[1*PacketSize]);
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||||
|
CJMADD(A1,B0,C4,T1);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T0);
|
||||||
|
CJMADD(A1,B1,C5,T1);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||||
|
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||||
|
}
|
||||||
|
|
||||||
blB += nr*PacketSize;
|
blB += nr*PacketSize;
|
||||||
blA += mr;
|
blA += mr;
|
||||||
@ -189,45 +259,79 @@ struct ei_gebp_kernel
|
|||||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||||
for(int k=0; k<peeled_kc; k+=4)
|
for(int k=0; k<peeled_kc; k+=4)
|
||||||
{
|
{
|
||||||
PacketType B0, B1, B2, B3, A0;
|
if(nr==2)
|
||||||
|
{
|
||||||
|
PacketType B0, T0, A0;
|
||||||
|
|
||||||
A0 = ei_pload(&blA[0*PacketSize]);
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
B0 = ei_pload(&blB[0*PacketSize]);
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
B1 = ei_pload(&blB[1*PacketSize]);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
B0 = ei_pload(&blB[1*PacketSize]);
|
||||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
|
||||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
|
||||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
|
||||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
|
||||||
A0 = ei_pload(&blA[1*PacketSize]);
|
|
||||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
|
||||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
|
||||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
|
||||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
|
||||||
A0 = ei_pload(&blA[2*PacketSize]);
|
|
||||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
|
||||||
|
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
A0 = ei_pload(&blA[1*PacketSize]);
|
||||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
B0 = ei_pload(&blB[2*PacketSize]);
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
CJMADD(A0,B0,C0,T0);
|
||||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
B0 = ei_pload(&blB[3*PacketSize]);
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
A0 = ei_pload(&blA[2*PacketSize]);
|
||||||
A0 = ei_pload(&blA[3*PacketSize]);
|
B0 = ei_pload(&blB[4*PacketSize]);
|
||||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
B0 = ei_pload(&blB[5*PacketSize]);
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
A0 = ei_pload(&blA[3*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[6*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
B0 = ei_pload(&blB[7*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C1,T0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
|
||||||
|
PacketType B0, B1, B2, B3, A0;
|
||||||
|
PacketType T0, T1;
|
||||||
|
|
||||||
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
|
B1 = ei_pload(&blB[1*PacketSize]);
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T1);
|
||||||
|
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||||
|
A0 = ei_pload(&blA[1*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T1);
|
||||||
|
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||||
|
A0 = ei_pload(&blA[2*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||||
|
CJMADD(A0,B1,C1,T1);
|
||||||
|
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||||
|
A0 = ei_pload(&blA[3*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A0,B1,C1,T1);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||||
|
}
|
||||||
|
|
||||||
blB += 4*nr*PacketSize;
|
blB += 4*nr*PacketSize;
|
||||||
blA += 4*PacketSize;
|
blA += 4*PacketSize;
|
||||||
@ -235,17 +339,32 @@ struct ei_gebp_kernel
|
|||||||
// process remaining peeled loop
|
// process remaining peeled loop
|
||||||
for(int k=peeled_kc; k<depth; k++)
|
for(int k=peeled_kc; k<depth; k++)
|
||||||
{
|
{
|
||||||
PacketType B0, B1, B2, B3, A0;
|
if(nr==2)
|
||||||
|
{
|
||||||
|
PacketType B0, T0, A0;
|
||||||
|
|
||||||
A0 = ei_pload(&blA[0*PacketSize]);
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
B0 = ei_pload(&blB[0*PacketSize]);
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
B1 = ei_pload(&blB[1*PacketSize]);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
B0 = ei_pload(&blB[1*PacketSize]);
|
||||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
}
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
else
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
{
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
PacketType B0, B1, B2, B3, A0;
|
||||||
|
PacketType T0, T1;
|
||||||
|
|
||||||
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
|
B1 = ei_pload(&blB[1*PacketSize]);
|
||||||
|
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||||
|
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A0,B1,C1,T1);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||||
|
}
|
||||||
|
|
||||||
blB += nr*PacketSize;
|
blB += nr*PacketSize;
|
||||||
blA += PacketSize;
|
blA += PacketSize;
|
||||||
@ -268,17 +387,32 @@ struct ei_gebp_kernel
|
|||||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||||
for(int k=0; k<depth; k++)
|
for(int k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
Scalar B0, B1, B2, B3, A0;
|
if(nr==2)
|
||||||
|
{
|
||||||
|
Scalar B0, T0, A0;
|
||||||
|
|
||||||
A0 = blA[k];
|
A0 = blA[0*PacketSize];
|
||||||
B0 = blB[0*PacketSize];
|
B0 = blB[0*PacketSize];
|
||||||
B1 = blB[1*PacketSize];
|
CJMADD(A0,B0,C0,T0);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
B0 = blB[1*PacketSize];
|
||||||
if(nr==4) B2 = blB[2*PacketSize];
|
CJMADD(A0,B0,C1,T0);
|
||||||
if(nr==4) B3 = blB[3*PacketSize];
|
}
|
||||||
C1 = cj.pmadd(A0, B1, C1);
|
else
|
||||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
{
|
||||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
Scalar B0, B1, B2, B3, A0;
|
||||||
|
Scalar T0, T1;
|
||||||
|
|
||||||
|
A0 = blA[k];
|
||||||
|
B0 = blB[0*PacketSize];
|
||||||
|
B1 = blB[1*PacketSize];
|
||||||
|
if(nr==4) B2 = blB[2*PacketSize];
|
||||||
|
if(nr==4) B3 = blB[3*PacketSize];
|
||||||
|
|
||||||
|
CJMADD(A0,B0,C0,T0);
|
||||||
|
CJMADD(A0,B1,C1,T1);
|
||||||
|
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||||
|
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||||
|
}
|
||||||
|
|
||||||
blB += nr*PacketSize;
|
blB += nr*PacketSize;
|
||||||
}
|
}
|
||||||
@ -310,13 +444,13 @@ struct ei_gebp_kernel
|
|||||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||||
for(int k=0; k<depth; k++)
|
for(int k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
PacketType B0, A0, A1;
|
PacketType B0, A0, A1, T0, T1;
|
||||||
|
|
||||||
A0 = ei_pload(&blA[0*PacketSize]);
|
A0 = ei_pload(&blA[0*PacketSize]);
|
||||||
A1 = ei_pload(&blA[1*PacketSize]);
|
A1 = ei_pload(&blA[1*PacketSize]);
|
||||||
B0 = ei_pload(&blB[0*PacketSize]);
|
B0 = ei_pload(&blB[0*PacketSize]);
|
||||||
C0 = cj.pmadd(A0, B0, C0);
|
CJMADD(A0,B0,C0,T0);
|
||||||
C4 = cj.pmadd(A1, B0, C4);
|
CJMADD(A1,B0,C4,T1);
|
||||||
|
|
||||||
blB += PacketSize;
|
blB += PacketSize;
|
||||||
blA += mr;
|
blA += mr;
|
||||||
@ -334,7 +468,7 @@ struct ei_gebp_kernel
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||||
|
|
||||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||||
for(int k=0; k<depth; k++)
|
for(int k=0; k<depth; k++)
|
||||||
{
|
{
|
||||||
@ -363,6 +497,8 @@ struct ei_gebp_kernel
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#undef CJMADD
|
||||||
|
|
||||||
// pack a block of the lhs
|
// pack a block of the lhs
|
||||||
// The travesal is as follow (mr==4):
|
// The travesal is as follow (mr==4):
|
||||||
// 0 4 8 12 ...
|
// 0 4 8 12 ...
|
||||||
@ -474,7 +610,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
|
|||||||
// skip what we have after
|
// skip what we have after
|
||||||
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
|
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy the remaining columns one at a time (nr==1)
|
// copy the remaining columns one at a time (nr==1)
|
||||||
for(int j2=packet_cols; j2<cols; ++j2)
|
for(int j2=packet_cols; j2<cols; ++j2)
|
||||||
{
|
{
|
||||||
|
@ -23,24 +23,31 @@
|
|||||||
// License and a copy of the GNU General Public License along with
|
// License and a copy of the GNU General Public License along with
|
||||||
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
#ifndef EIGEN_BENCH_TIMER_H
|
#ifndef EIGEN_BENCH_TIMERR_H
|
||||||
#define EIGEN_BENCH_TIMER_H
|
#define EIGEN_BENCH_TIMERR_H
|
||||||
|
|
||||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#else
|
#else
|
||||||
|
#include <sys/time.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
|
|
||||||
namespace Eigen
|
namespace Eigen
|
||||||
{
|
{
|
||||||
|
|
||||||
|
enum {
|
||||||
|
CPU_TIMER = 0,
|
||||||
|
REAL_TIMER = 1
|
||||||
|
};
|
||||||
|
|
||||||
/** Elapsed time timer keeping the best try.
|
/** Elapsed time timer keeping the best try.
|
||||||
*
|
*
|
||||||
* On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID.
|
* On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID.
|
||||||
@ -52,37 +59,58 @@ class BenchTimer
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
BenchTimer()
|
BenchTimer()
|
||||||
{
|
{
|
||||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||||
LARGE_INTEGER freq;
|
LARGE_INTEGER freq;
|
||||||
QueryPerformanceFrequency(&freq);
|
QueryPerformanceFrequency(&freq);
|
||||||
m_frequency = (double)freq.QuadPart;
|
m_frequency = (double)freq.QuadPart;
|
||||||
#endif
|
#endif
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
~BenchTimer() {}
|
~BenchTimer() {}
|
||||||
|
|
||||||
inline void reset(void) {m_best = 1e6;}
|
inline void reset()
|
||||||
inline void start(void) {m_start = getTime();}
|
|
||||||
inline void stop(void)
|
|
||||||
{
|
{
|
||||||
m_best = std::min(m_best, getTime() - m_start);
|
m_bests.fill(1e9);
|
||||||
|
m_totals.setZero();
|
||||||
|
}
|
||||||
|
inline void start()
|
||||||
|
{
|
||||||
|
m_starts[CPU_TIMER] = getCpuTime();
|
||||||
|
m_starts[REAL_TIMER] = getRealTime();
|
||||||
|
}
|
||||||
|
inline void stop()
|
||||||
|
{
|
||||||
|
m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER];
|
||||||
|
m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER];
|
||||||
|
m_bests = m_bests.cwiseMin(m_times);
|
||||||
|
m_totals += m_times;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return the best elapsed time in seconds.
|
/** Return the elapsed time in seconds between the last start/stop pair
|
||||||
*/
|
*/
|
||||||
inline double value(void)
|
inline double value(int TIMER = CPU_TIMER)
|
||||||
{
|
{
|
||||||
return m_best;
|
return m_times[TIMER];
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
/** Return the best elapsed time in seconds
|
||||||
inline double getTime(void)
|
*/
|
||||||
#else
|
inline double best(int TIMER = CPU_TIMER)
|
||||||
static inline double getTime(void)
|
{
|
||||||
#endif
|
return m_bests[TIMER];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return the total elapsed time in seconds.
|
||||||
|
*/
|
||||||
|
inline double total(int TIMER = CPU_TIMER)
|
||||||
|
{
|
||||||
|
return m_totals[TIMER];
|
||||||
|
}
|
||||||
|
|
||||||
|
inline double getCpuTime()
|
||||||
{
|
{
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
LARGE_INTEGER query_ticks;
|
LARGE_INTEGER query_ticks;
|
||||||
@ -95,14 +123,42 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline double getRealTime()
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
SYSTEMTIME st;
|
||||||
|
GetSystemTime(&st);
|
||||||
|
return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds;
|
||||||
|
#else
|
||||||
|
struct timeval tv;
|
||||||
|
struct timezone tz;
|
||||||
|
gettimeofday(&tv, &tz);
|
||||||
|
return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||||
double m_frequency;
|
double m_frequency;
|
||||||
#endif
|
#endif
|
||||||
double m_best, m_start;
|
Vector2d m_starts;
|
||||||
|
Vector2d m_times;
|
||||||
|
Vector2d m_bests;
|
||||||
|
Vector2d m_totals;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define BENCH(TIMER,TRIES,REP,CODE) { \
|
||||||
|
TIMER.reset(); \
|
||||||
|
for(int uglyvarname1=0; uglyvarname1<TRIES; ++uglyvarname1){ \
|
||||||
|
TIMER.start(); \
|
||||||
|
for(int uglyvarname2=0; uglyvarname2<REP; ++uglyvarname2){ \
|
||||||
|
CODE; \
|
||||||
|
} \
|
||||||
|
TIMER.stop(); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // EIGEN_BENCH_TIMER_H
|
#endif // EIGEN_BENCH_TIMERR_H
|
||||||
|
@ -2,10 +2,11 @@
|
|||||||
|
|
||||||
# gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000"
|
# gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000"
|
||||||
# icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions"
|
# icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions"
|
||||||
|
CXX=${CXX-g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000} # default value
|
||||||
|
|
||||||
for ((i=1; i<16; ++i)); do
|
for ((i=1; i<16; ++i)); do
|
||||||
echo "Matrix size: $i x $i :"
|
echo "Matrix size: $i x $i :"
|
||||||
$CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=1024 -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null
|
$CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null
|
||||||
$CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null
|
$CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null
|
||||||
echo " "
|
echo " "
|
||||||
done
|
done
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
CXX=${CXX-g++} # default value unless caller has defined CXX
|
||||||
echo "Fixed size 3x3, column-major, -DNDEBUG"
|
echo "Fixed size 3x3, column-major, -DNDEBUG"
|
||||||
$CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null
|
$CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null
|
||||||
echo "Fixed size 3x3, column-major, with asserts"
|
echo "Fixed size 3x3, column-major, with asserts"
|
||||||
|
@ -178,9 +178,9 @@ class MatrixFunction<MatrixType, 1>
|
|||||||
*
|
*
|
||||||
* This is morally a \c static \c const \c Scalar, but only
|
* This is morally a \c static \c const \c Scalar, but only
|
||||||
* integers can be static constant class members in C++. The
|
* integers can be static constant class members in C++. The
|
||||||
* separation constant is set to 0.01, a value taken from the
|
* separation constant is set to 0.1, a value taken from the
|
||||||
* paper by Davies and Higham. */
|
* paper by Davies and Higham. */
|
||||||
static const RealScalar separation() { return static_cast<RealScalar>(0.01); }
|
static const RealScalar separation() { return static_cast<RealScalar>(0.1); }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** \brief Constructor.
|
/** \brief Constructor.
|
||||||
@ -492,14 +492,12 @@ typename MatrixFunction<MatrixType,1>::DynMatrixType MatrixFunction<MatrixType,1
|
|||||||
template<typename Derived> class MatrixFunctionReturnValue
|
template<typename Derived> class MatrixFunctionReturnValue
|
||||||
: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
|
: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
|
||||||
{
|
{
|
||||||
private:
|
public:
|
||||||
|
|
||||||
typedef typename ei_traits<Derived>::Scalar Scalar;
|
typedef typename ei_traits<Derived>::Scalar Scalar;
|
||||||
typedef typename ei_stem_function<Scalar>::type StemFunction;
|
typedef typename ei_stem_function<Scalar>::type StemFunction;
|
||||||
|
|
||||||
public:
|
/** \brief Constructor.
|
||||||
|
|
||||||
/** \brief Constructor.
|
|
||||||
*
|
*
|
||||||
* \param[in] A %Matrix (expression) forming the argument of the
|
* \param[in] A %Matrix (expression) forming the argument of the
|
||||||
* matrix function.
|
* matrix function.
|
||||||
|
@ -109,11 +109,10 @@ template<typename MatrixType>
|
|||||||
void testHyperbolicFunctions(const MatrixType& A)
|
void testHyperbolicFunctions(const MatrixType& A)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < g_repeat; i++) {
|
for (int i = 0; i < g_repeat; i++) {
|
||||||
MatrixType sinhA = ei_matrix_sinh(A);
|
|
||||||
MatrixType coshA = ei_matrix_cosh(A);
|
|
||||||
MatrixType expA = ei_matrix_exponential(A);
|
MatrixType expA = ei_matrix_exponential(A);
|
||||||
VERIFY_IS_APPROX(sinhA, (expA - expA.inverse())/2);
|
MatrixType expmA = ei_matrix_exponential(-A);
|
||||||
VERIFY_IS_APPROX(coshA, (expA + expA.inverse())/2);
|
VERIFY_IS_APPROX(ei_matrix_sinh(A), (expA - expmA) / 2);
|
||||||
|
VERIFY_IS_APPROX(ei_matrix_cosh(A), (expA + expmA) / 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -134,14 +133,15 @@ void testGonioFunctions(const MatrixType& A)
|
|||||||
ComplexMatrix Ac = A.template cast<ComplexScalar>();
|
ComplexMatrix Ac = A.template cast<ComplexScalar>();
|
||||||
|
|
||||||
ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac);
|
ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac);
|
||||||
|
ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac);
|
||||||
|
|
||||||
MatrixType sinA = ei_matrix_sin(A);
|
MatrixType sinA = ei_matrix_sin(A);
|
||||||
ComplexMatrix sinAc = sinA.template cast<ComplexScalar>();
|
ComplexMatrix sinAc = sinA.template cast<ComplexScalar>();
|
||||||
VERIFY_IS_APPROX(sinAc, (exp_iA - exp_iA.inverse()) / (two*imagUnit));
|
VERIFY_IS_APPROX(sinAc, (exp_iA - exp_miA) / (two*imagUnit));
|
||||||
|
|
||||||
MatrixType cosA = ei_matrix_cos(A);
|
MatrixType cosA = ei_matrix_cos(A);
|
||||||
ComplexMatrix cosAc = cosA.template cast<ComplexScalar>();
|
ComplexMatrix cosAc = cosA.template cast<ComplexScalar>();
|
||||||
VERIFY_IS_APPROX(cosAc, (exp_iA + exp_iA.inverse()) / 2);
|
VERIFY_IS_APPROX(cosAc, (exp_iA + exp_miA) / 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user