This commit is contained in:
Gael Guennebaud 2010-02-22 21:32:29 +01:00
commit 3beedba244
6 changed files with 358 additions and 166 deletions

View File

@ -27,6 +27,12 @@
#ifndef EIGEN_EXTERN_INSTANTIATIONS #ifndef EIGEN_EXTERN_INSTANTIATIONS
#ifdef EIGEN_HAS_FUSE_CJMADD
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
#else
#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
#endif
// optimized GEneral packed Block * packed Panel product kernel // optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, int mr, int nr, typename Conj> template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel struct ei_gebp_kernel
@ -74,65 +80,111 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
for(int k=0; k<peeled_kc; k+=4) for(int k=0; k<peeled_kc; k+=4)
{ {
PacketType B0, B1, B2, B3, A0, A1; if(nr==2)
{
PacketType B0, T0, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]); CJMADD(A0,B0,C0,T0);
C0 = cj.pmadd(A0, B0, C0); CJMADD(A1,B0,C4,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); B0 = ei_pload(&blB[1*PacketSize]);
C4 = cj.pmadd(A1, B0, C4); CJMADD(A0,B0,C1,T0);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); CJMADD(A1,B0,C5,T0);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[4*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[5*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
C0 = cj.pmadd(A0, B0, C0); A0 = ei_pload(&blA[2*PacketSize]);
C4 = cj.pmadd(A1, B0, C4); A1 = ei_pload(&blA[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); B0 = ei_pload(&blB[2*PacketSize]);
C1 = cj.pmadd(A0, B1, C1); CJMADD(A0,B0,C0,T0);
C5 = cj.pmadd(A1, B1, C5); CJMADD(A1,B0,C4,T0);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); B0 = ei_pload(&blB[3*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2); CJMADD(A0,B0,C1,T0);
if(nr==4) C6 = cj.pmadd(A1, B2, C6); CJMADD(A1,B0,C5,T0);
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3); A0 = ei_pload(&blA[4*PacketSize]);
A0 = ei_pload(&blA[6*PacketSize]); A1 = ei_pload(&blA[5*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7); B0 = ei_pload(&blB[4*PacketSize]);
A1 = ei_pload(&blA[7*PacketSize]); CJMADD(A0,B0,C0,T0);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); CJMADD(A1,B0,C4,T0);
C0 = cj.pmadd(A0, B0, C0); B0 = ei_pload(&blB[5*PacketSize]);
C4 = cj.pmadd(A1, B0, C4); CJMADD(A0,B0,C1,T0);
C1 = cj.pmadd(A0, B1, C1); CJMADD(A1,B0,C5,T0);
C5 = cj.pmadd(A1, B1, C5);
if(nr==4) C2 = cj.pmadd(A0, B2, C2); A0 = ei_pload(&blA[6*PacketSize]);
if(nr==4) C6 = cj.pmadd(A1, B2, C6); A1 = ei_pload(&blA[7*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3); B0 = ei_pload(&blB[6*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T0);
B0 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C1,T0);
CJMADD(A1,B0,C5,T0);
}
else
{
PacketType B0, B1, B2, B3, A0, A1;
PacketType T0, T1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C0,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
CJMADD(A1,B0,C4,T1);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A1,B2,C6,T1); }
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T0); }
A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) { CJMADD(A1,B3,C7,T1); }
A1 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T1);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A1,B2,C6,T1); }
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T0); }
A0 = ei_pload(&blA[4*PacketSize]);
if(nr==4) { CJMADD(A1,B3,C7,T1); }
A1 = ei_pload(&blA[5*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T1);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A1,B2,C6,T1); }
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T0); }
A0 = ei_pload(&blA[6*PacketSize]);
if(nr==4) { CJMADD(A1,B3,C7,T1); }
A1 = ei_pload(&blA[7*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T1);
CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A1,B2,C6,T1); }
if(nr==4) { CJMADD(A0,B3,C3,T0); }
if(nr==4) { CJMADD(A1,B3,C7,T1); }
}
blB += 4*nr*PacketSize; blB += 4*nr*PacketSize;
blA += 4*mr; blA += 4*mr;
@ -140,22 +192,40 @@ struct ei_gebp_kernel
// process remaining peeled loop // process remaining peeled loop
for(int k=peeled_kc; k<depth; k++) for(int k=peeled_kc; k<depth; k++)
{ {
PacketType B0, B1, B2, B3, A0, A1; if(nr==2)
{
PacketType B0, T0, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]); CJMADD(A0,B0,C0,T0);
C0 = cj.pmadd(A0, B0, C0); CJMADD(A1,B0,C4,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); B0 = ei_pload(&blB[1*PacketSize]);
C4 = cj.pmadd(A1, B0, C4); CJMADD(A0,B0,C1,T0);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); CJMADD(A1,B0,C5,T0);
C1 = cj.pmadd(A0, B1, C1); }
C5 = cj.pmadd(A1, B1, C5); else
if(nr==4) C2 = cj.pmadd(A0, B2, C2); {
if(nr==4) C6 = cj.pmadd(A1, B2, C6); PacketType B0, B1, B2, B3, A0, A1, T0, T1;
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
if(nr==4) C7 = cj.pmadd(A1, B3, C7); A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C0,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
CJMADD(A1,B0,C4,T1);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A1,B2,C6,T1); }
if(nr==4) { CJMADD(A0,B3,C3,T0); }
if(nr==4) { CJMADD(A1,B3,C7,T1); }
}
blB += nr*PacketSize; blB += nr*PacketSize;
blA += mr; blA += mr;
@ -189,45 +259,79 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
for(int k=0; k<peeled_kc; k+=4) for(int k=0; k<peeled_kc; k+=4)
{ {
PacketType B0, B1, B2, B3, A0; if(nr==2)
{
PacketType B0, T0, A0;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]); CJMADD(A0,B0,C0,T0);
C0 = cj.pmadd(A0, B0, C0); B0 = ei_pload(&blB[1*PacketSize]);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); CJMADD(A0,B0,C1,T0);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[1*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
C0 = cj.pmadd(A0, B0, C0); A0 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); B0 = ei_pload(&blB[2*PacketSize]);
C1 = cj.pmadd(A0, B1, C1); CJMADD(A0,B0,C0,T0);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); B0 = ei_pload(&blB[3*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2); CJMADD(A0,B0,C1,T0);
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3); A0 = ei_pload(&blA[2*PacketSize]);
A0 = ei_pload(&blA[3*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); CJMADD(A0,B0,C0,T0);
C0 = cj.pmadd(A0, B0, C0); B0 = ei_pload(&blB[5*PacketSize]);
C1 = cj.pmadd(A0, B1, C1); CJMADD(A0,B0,C1,T0);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C3 = cj.pmadd(A0, B3, C3); A0 = ei_pload(&blA[3*PacketSize]);
B0 = ei_pload(&blB[6*PacketSize]);
CJMADD(A0,B0,C0,T0);
B0 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C1,T0);
}
else
{
PacketType B0, B1, B2, B3, A0;
PacketType T0, T1;
A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C0,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
CJMADD(A0,B1,C1,T1);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T1); }
A0 = ei_pload(&blA[1*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C0,T0);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
CJMADD(A0,B1,C1,T1);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T1); }
A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
CJMADD(A0,B0,C0,T0);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
CJMADD(A0,B1,C1,T1);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T1); }
A0 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
CJMADD(A0,B0,C0,T0);
CJMADD(A0,B1,C1,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A0,B3,C3,T1); }
}
blB += 4*nr*PacketSize; blB += 4*nr*PacketSize;
blA += 4*PacketSize; blA += 4*PacketSize;
@ -235,17 +339,32 @@ struct ei_gebp_kernel
// process remaining peeled loop // process remaining peeled loop
for(int k=peeled_kc; k<depth; k++) for(int k=peeled_kc; k<depth; k++)
{ {
PacketType B0, B1, B2, B3, A0; if(nr==2)
{
PacketType B0, T0, A0;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]); CJMADD(A0,B0,C0,T0);
C0 = cj.pmadd(A0, B0, C0); B0 = ei_pload(&blB[1*PacketSize]);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); CJMADD(A0,B0,C1,T0);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); }
C1 = cj.pmadd(A0, B1, C1); else
if(nr==4) C2 = cj.pmadd(A0, B2, C2); {
if(nr==4) C3 = cj.pmadd(A0, B3, C3); PacketType B0, B1, B2, B3, A0;
PacketType T0, T1;
A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
CJMADD(A0,B0,C0,T0);
CJMADD(A0,B1,C1,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A0,B3,C3,T1); }
}
blB += nr*PacketSize; blB += nr*PacketSize;
blA += PacketSize; blA += PacketSize;
@ -268,17 +387,32 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
Scalar B0, B1, B2, B3, A0; if(nr==2)
{
Scalar B0, T0, A0;
A0 = blA[k]; A0 = blA[0*PacketSize];
B0 = blB[0*PacketSize]; B0 = blB[0*PacketSize];
B1 = blB[1*PacketSize]; CJMADD(A0,B0,C0,T0);
C0 = cj.pmadd(A0, B0, C0); B0 = blB[1*PacketSize];
if(nr==4) B2 = blB[2*PacketSize]; CJMADD(A0,B0,C1,T0);
if(nr==4) B3 = blB[3*PacketSize]; }
C1 = cj.pmadd(A0, B1, C1); else
if(nr==4) C2 = cj.pmadd(A0, B2, C2); {
if(nr==4) C3 = cj.pmadd(A0, B3, C3); Scalar B0, B1, B2, B3, A0;
Scalar T0, T1;
A0 = blA[k];
B0 = blB[0*PacketSize];
B1 = blB[1*PacketSize];
if(nr==4) B2 = blB[2*PacketSize];
if(nr==4) B3 = blB[3*PacketSize];
CJMADD(A0,B0,C0,T0);
CJMADD(A0,B1,C1,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) { CJMADD(A0,B3,C3,T1); }
}
blB += nr*PacketSize; blB += nr*PacketSize;
} }
@ -310,13 +444,13 @@ struct ei_gebp_kernel
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
PacketType B0, A0, A1; PacketType B0, A0, A1, T0, T1;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
C0 = cj.pmadd(A0, B0, C0); CJMADD(A0,B0,C0,T0);
C4 = cj.pmadd(A1, B0, C4); CJMADD(A1,B0,C4,T1);
blB += PacketSize; blB += PacketSize;
blA += mr; blA += mr;
@ -363,6 +497,8 @@ struct ei_gebp_kernel
} }
}; };
#undef CJMADD
// pack a block of the lhs // pack a block of the lhs
// The travesal is as follow (mr==4): // The travesal is as follow (mr==4):
// 0 4 8 12 ... // 0 4 8 12 ...

View File

@ -23,24 +23,31 @@
// License and a copy of the GNU General Public License along with // License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>. // Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_BENCH_TIMER_H #ifndef EIGEN_BENCH_TIMERR_H
#define EIGEN_BENCH_TIMER_H #define EIGEN_BENCH_TIMERR_H
#if defined(_WIN32) || defined(__CYGWIN__) #if defined(_WIN32) || defined(__CYGWIN__)
#define NOMINMAX #define NOMINMAX
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#include <windows.h> #include <windows.h>
#else #else
#include <sys/time.h>
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
#include <cmath>
#include <cstdlib> #include <cstdlib>
#include <numeric> #include <numeric>
namespace Eigen namespace Eigen
{ {
enum {
CPU_TIMER = 0,
REAL_TIMER = 1
};
/** Elapsed time timer keeping the best try. /** Elapsed time timer keeping the best try.
* *
* On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID. * On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID.
@ -64,25 +71,46 @@ public:
~BenchTimer() {} ~BenchTimer() {}
inline void reset(void) {m_best = 1e6;} inline void reset()
inline void start(void) {m_start = getTime();}
inline void stop(void)
{ {
m_best = std::min(m_best, getTime() - m_start); m_bests.fill(1e9);
m_totals.setZero();
}
inline void start()
{
m_starts[CPU_TIMER] = getCpuTime();
m_starts[REAL_TIMER] = getRealTime();
}
inline void stop()
{
m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER];
m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER];
m_bests = m_bests.cwiseMin(m_times);
m_totals += m_times;
} }
/** Return the best elapsed time in seconds. /** Return the elapsed time in seconds between the last start/stop pair
*/ */
inline double value(void) inline double value(int TIMER = CPU_TIMER)
{ {
return m_best; return m_times[TIMER];
} }
#if defined(_WIN32) || defined(__CYGWIN__) /** Return the best elapsed time in seconds
inline double getTime(void) */
#else inline double best(int TIMER = CPU_TIMER)
static inline double getTime(void) {
#endif return m_bests[TIMER];
}
/** Return the total elapsed time in seconds.
*/
inline double total(int TIMER = CPU_TIMER)
{
return m_totals[TIMER];
}
inline double getCpuTime()
{ {
#ifdef WIN32 #ifdef WIN32
LARGE_INTEGER query_ticks; LARGE_INTEGER query_ticks;
@ -95,14 +123,42 @@ public:
#endif #endif
} }
inline double getRealTime()
{
#ifdef WIN32
SYSTEMTIME st;
GetSystemTime(&st);
return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds;
#else
struct timeval tv;
struct timezone tz;
gettimeofday(&tv, &tz);
return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
#endif
}
protected: protected:
#if defined(_WIN32) || defined(__CYGWIN__) #if defined(_WIN32) || defined(__CYGWIN__)
double m_frequency; double m_frequency;
#endif #endif
double m_best, m_start; Vector2d m_starts;
Vector2d m_times;
Vector2d m_bests;
Vector2d m_totals;
}; };
#define BENCH(TIMER,TRIES,REP,CODE) { \
TIMER.reset(); \
for(int uglyvarname1=0; uglyvarname1<TRIES; ++uglyvarname1){ \
TIMER.start(); \
for(int uglyvarname2=0; uglyvarname2<REP; ++uglyvarname2){ \
CODE; \
} \
TIMER.stop(); \
} \
}
} }
#endif // EIGEN_BENCH_TIMER_H #endif // EIGEN_BENCH_TIMERR_H

View File

@ -2,10 +2,11 @@
# gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000" # gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000"
# icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions" # icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions"
CXX=${CXX-g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000} # default value
for ((i=1; i<16; ++i)); do for ((i=1; i<16; ++i)); do
echo "Matrix size: $i x $i :" echo "Matrix size: $i x $i :"
$CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=1024 -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null
$CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null $CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null
echo " " echo " "
done done

View File

@ -1,4 +1,5 @@
#!/bin/bash #!/bin/bash
CXX=${CXX-g++} # default value unless caller has defined CXX
echo "Fixed size 3x3, column-major, -DNDEBUG" echo "Fixed size 3x3, column-major, -DNDEBUG"
$CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null $CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null
echo "Fixed size 3x3, column-major, with asserts" echo "Fixed size 3x3, column-major, with asserts"

View File

@ -178,9 +178,9 @@ class MatrixFunction<MatrixType, 1>
* *
* This is morally a \c static \c const \c Scalar, but only * This is morally a \c static \c const \c Scalar, but only
* integers can be static constant class members in C++. The * integers can be static constant class members in C++. The
* separation constant is set to 0.01, a value taken from the * separation constant is set to 0.1, a value taken from the
* paper by Davies and Higham. */ * paper by Davies and Higham. */
static const RealScalar separation() { return static_cast<RealScalar>(0.01); } static const RealScalar separation() { return static_cast<RealScalar>(0.1); }
}; };
/** \brief Constructor. /** \brief Constructor.
@ -492,14 +492,12 @@ typename MatrixFunction<MatrixType,1>::DynMatrixType MatrixFunction<MatrixType,1
template<typename Derived> class MatrixFunctionReturnValue template<typename Derived> class MatrixFunctionReturnValue
: public ReturnByValue<MatrixFunctionReturnValue<Derived> > : public ReturnByValue<MatrixFunctionReturnValue<Derived> >
{ {
private: public:
typedef typename ei_traits<Derived>::Scalar Scalar; typedef typename ei_traits<Derived>::Scalar Scalar;
typedef typename ei_stem_function<Scalar>::type StemFunction; typedef typename ei_stem_function<Scalar>::type StemFunction;
public: /** \brief Constructor.
/** \brief Constructor.
* *
* \param[in] A %Matrix (expression) forming the argument of the * \param[in] A %Matrix (expression) forming the argument of the
* matrix function. * matrix function.

View File

@ -109,11 +109,10 @@ template<typename MatrixType>
void testHyperbolicFunctions(const MatrixType& A) void testHyperbolicFunctions(const MatrixType& A)
{ {
for (int i = 0; i < g_repeat; i++) { for (int i = 0; i < g_repeat; i++) {
MatrixType sinhA = ei_matrix_sinh(A);
MatrixType coshA = ei_matrix_cosh(A);
MatrixType expA = ei_matrix_exponential(A); MatrixType expA = ei_matrix_exponential(A);
VERIFY_IS_APPROX(sinhA, (expA - expA.inverse())/2); MatrixType expmA = ei_matrix_exponential(-A);
VERIFY_IS_APPROX(coshA, (expA + expA.inverse())/2); VERIFY_IS_APPROX(ei_matrix_sinh(A), (expA - expmA) / 2);
VERIFY_IS_APPROX(ei_matrix_cosh(A), (expA + expmA) / 2);
} }
} }
@ -134,14 +133,15 @@ void testGonioFunctions(const MatrixType& A)
ComplexMatrix Ac = A.template cast<ComplexScalar>(); ComplexMatrix Ac = A.template cast<ComplexScalar>();
ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac); ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac);
ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac);
MatrixType sinA = ei_matrix_sin(A); MatrixType sinA = ei_matrix_sin(A);
ComplexMatrix sinAc = sinA.template cast<ComplexScalar>(); ComplexMatrix sinAc = sinA.template cast<ComplexScalar>();
VERIFY_IS_APPROX(sinAc, (exp_iA - exp_iA.inverse()) / (two*imagUnit)); VERIFY_IS_APPROX(sinAc, (exp_iA - exp_miA) / (two*imagUnit));
MatrixType cosA = ei_matrix_cos(A); MatrixType cosA = ei_matrix_cos(A);
ComplexMatrix cosAc = cosA.template cast<ComplexScalar>(); ComplexMatrix cosAc = cosA.template cast<ComplexScalar>();
VERIFY_IS_APPROX(cosAc, (exp_iA + exp_iA.inverse()) / 2); VERIFY_IS_APPROX(cosAc, (exp_iA + exp_miA) / 2);
} }
} }