mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 11:49:02 +08:00
significant speedup in the matrix-matrix products
This commit is contained in:
parent
d579d4cc37
commit
eb905500b6
@ -184,11 +184,12 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) {
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
|
||||
|
||||
#if (!defined __GNUC__) && (!defined __ICC)
|
||||
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
|
||||
#else
|
||||
// #if (!defined __GNUC__) && (!defined __ICC)
|
||||
// template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
|
||||
// template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
|
||||
// template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
|
||||
// #else
|
||||
|
||||
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
|
||||
// require pointer casting to incompatible pointer types and leads to invalid code
|
||||
// because of the strict aliasing rule. The "dummy" stuff are required to enforce
|
||||
@ -197,28 +198,27 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_
|
||||
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
__m128 res;
|
||||
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
|
||||
asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
|
||||
return res;
|
||||
__m128d res;
|
||||
res = _mm_load_sd((const double*)(from)) ;
|
||||
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
|
||||
return _mm_castpd_ps(res);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
__m128d res;
|
||||
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) );
|
||||
asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) );
|
||||
res = _mm_load_sd(from) ;
|
||||
res = _mm_loadh_pd(res,from+1);
|
||||
return res;
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
__m128i res;
|
||||
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
|
||||
asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
|
||||
return res;
|
||||
__m128d res;
|
||||
res = _mm_load_sd((const double*)(from)) ;
|
||||
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
|
||||
return _mm_castpd_si128(res);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
|
||||
template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
|
||||
@ -277,6 +277,13 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void ei_punpackp(Packet4f* vecs)
|
||||
{
|
||||
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
|
||||
vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
|
||||
vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
|
||||
vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
|
||||
}
|
||||
|
||||
#ifdef __SSE3__
|
||||
// TODO implement SSE2 versions as well as integer versions
|
||||
|
@ -30,7 +30,7 @@
|
||||
#ifdef EIGEN_HAS_FUSE_CJMADD
|
||||
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
|
||||
#else
|
||||
#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
|
||||
#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
|
||||
#endif
|
||||
|
||||
// optimized GEneral packed Block * packed Panel product kernel
|
||||
@ -48,9 +48,66 @@ struct ei_gebp_kernel
|
||||
const int peeled_mc = (rows/mr)*mr;
|
||||
const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
|
||||
const int peeled_kc = (depth/4)*4;
|
||||
|
||||
Scalar* unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
|
||||
|
||||
// loops on each micro vertical panel of rhs (depth x nr)
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
// unpack B
|
||||
{
|
||||
const Scalar* blB = &blockB[j2*strideB+offsetB*nr];
|
||||
int n = depth*nr;
|
||||
for(int k=0; k<n; k++)
|
||||
ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
|
||||
/*Scalar* dest = unpackedB;
|
||||
for(int k=0; k<n; k+=4*PacketSize)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
const int S = 128;
|
||||
const int G = 16;
|
||||
_mm_prefetch((const char*)(&blB[S/2+0]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&dest[S+0*G]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&dest[S+1*G]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&dest[S+2*G]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize];
|
||||
C0[0] = ei_pload(blB+0*PacketSize);
|
||||
C1[0] = ei_pload(blB+1*PacketSize);
|
||||
C2[0] = ei_pload(blB+2*PacketSize);
|
||||
C3[0] = ei_pload(blB+3*PacketSize);
|
||||
|
||||
ei_punpackp(C0);
|
||||
ei_punpackp(C1);
|
||||
ei_punpackp(C2);
|
||||
ei_punpackp(C3);
|
||||
|
||||
ei_pstore(dest+ 0*PacketSize, C0[0]);
|
||||
ei_pstore(dest+ 1*PacketSize, C0[1]);
|
||||
ei_pstore(dest+ 2*PacketSize, C0[2]);
|
||||
ei_pstore(dest+ 3*PacketSize, C0[3]);
|
||||
|
||||
ei_pstore(dest+ 4*PacketSize, C1[0]);
|
||||
ei_pstore(dest+ 5*PacketSize, C1[1]);
|
||||
ei_pstore(dest+ 6*PacketSize, C1[2]);
|
||||
ei_pstore(dest+ 7*PacketSize, C1[3]);
|
||||
|
||||
ei_pstore(dest+ 8*PacketSize, C2[0]);
|
||||
ei_pstore(dest+ 9*PacketSize, C2[1]);
|
||||
ei_pstore(dest+10*PacketSize, C2[2]);
|
||||
ei_pstore(dest+11*PacketSize, C2[3]);
|
||||
|
||||
ei_pstore(dest+12*PacketSize, C3[0]);
|
||||
ei_pstore(dest+13*PacketSize, C3[1]);
|
||||
ei_pstore(dest+14*PacketSize, C3[2]);
|
||||
ei_pstore(dest+15*PacketSize, C3[3]);
|
||||
|
||||
blB += 4*PacketSize;
|
||||
dest += 16*PacketSize;
|
||||
}*/
|
||||
}
|
||||
// loops on each micro horizontal panel of lhs (mr x depth)
|
||||
// => we select a mr x nr micro block of res which is entirely
|
||||
// stored into mr/packet_size x nr registers.
|
||||
@ -65,19 +122,31 @@ struct ei_gebp_kernel
|
||||
|
||||
// gets res block as register
|
||||
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
|
||||
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
|
||||
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
|
||||
C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
|
||||
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
|
||||
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
|
||||
C0 = ei_pset1(Scalar(0));
|
||||
C1 = ei_pset1(Scalar(0));
|
||||
if(nr==4) C2 = ei_pset1(Scalar(0));
|
||||
if(nr==4) C3 = ei_pset1(Scalar(0));
|
||||
C4 = ei_pset1(Scalar(0));
|
||||
C5 = ei_pset1(Scalar(0));
|
||||
if(nr==4) C6 = ei_pset1(Scalar(0));
|
||||
if(nr==4) C7 = ei_pset1(Scalar(0));
|
||||
|
||||
Scalar* r0 = &res[(j2+0)*resStride + i];
|
||||
Scalar* r1 = r0 + resStride;
|
||||
Scalar* r2 = r1 + resStride;
|
||||
Scalar* r3 = r2 + resStride;
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(r0+16), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(r1+16), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(r2+16), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(r3+16), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// performs "inner" product
|
||||
// TODO let's check wether the flowing peeled loop could not be
|
||||
// optimized via optimal prefetching from one loop to the other
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
const Scalar* blB = unpackedB;
|
||||
for(int k=0; k<peeled_kc; k+=4)
|
||||
{
|
||||
if(nr==2)
|
||||
@ -88,102 +157,101 @@ struct ei_gebp_kernel
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[1*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
CJMADD(A1,B0,C5,T0);
|
||||
CJMADD(A1,B0,C5,B0);
|
||||
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
A1 = ei_pload(&blA[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[2*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[3*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
CJMADD(A1,B0,C5,T0);
|
||||
CJMADD(A1,B0,C5,B0);
|
||||
|
||||
A0 = ei_pload(&blA[4*PacketSize]);
|
||||
A1 = ei_pload(&blA[5*PacketSize]);
|
||||
B0 = ei_pload(&blB[4*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[5*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
CJMADD(A1,B0,C5,T0);
|
||||
CJMADD(A1,B0,C5,B0);
|
||||
|
||||
A0 = ei_pload(&blA[6*PacketSize]);
|
||||
A1 = ei_pload(&blA[7*PacketSize]);
|
||||
B0 = ei_pload(&blB[6*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
CJMADD(A1,B0,C5,T0);
|
||||
CJMADD(A1,B0,C5,B0);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
PacketType T0, T1;
|
||||
PacketType T0;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B2 = ei_pload(&blB[2*PacketSize]);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[4*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,B1);
|
||||
B1 = ei_pload(&blB[5*PacketSize]);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A1,B2,C6,B2);
|
||||
B2 = ei_pload(&blB[6*PacketSize]);
|
||||
CJMADD(A0,B3,C3,T0);
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
CJMADD(A1,B3,C7,B3);
|
||||
A1 = ei_pload(&blA[3*PacketSize]);
|
||||
B3 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[8*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,B1);
|
||||
B1 = ei_pload(&blB[9*PacketSize]);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A1,B2,C6,B2);
|
||||
B2 = ei_pload(&blB[10*PacketSize]);
|
||||
CJMADD(A0,B3,C3,T0);
|
||||
A0 = ei_pload(&blA[4*PacketSize]);
|
||||
CJMADD(A1,B3,C7,B3);
|
||||
A1 = ei_pload(&blA[5*PacketSize]);
|
||||
B3 = ei_pload(&blB[11*PacketSize]);
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
CJMADD(A1,B0,C4,T1);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,T1);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||
A1 = ei_pload(&blA[3*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T1);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,T1);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||
A0 = ei_pload(&blA[4*PacketSize]);
|
||||
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||
A1 = ei_pload(&blA[5*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T1);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,T1);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||
A0 = ei_pload(&blA[6*PacketSize]);
|
||||
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||
A1 = ei_pload(&blA[7*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T1);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,T1);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[12*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,B1);
|
||||
B1 = ei_pload(&blB[13*PacketSize]);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A1,B2,C6,B2);
|
||||
B2 = ei_pload(&blB[14*PacketSize]);
|
||||
CJMADD(A0,B3,C3,T0);
|
||||
A0 = ei_pload(&blA[6*PacketSize]);
|
||||
CJMADD(A1,B3,C7,B3);
|
||||
A1 = ei_pload(&blA[7*PacketSize]);
|
||||
B3 = ei_pload(&blB[15*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,B1);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A1,B2,C6,B2);
|
||||
CJMADD(A0,B3,C3,T0);
|
||||
CJMADD(A1,B3,C7,B3);
|
||||
}
|
||||
|
||||
blB += 4*nr*PacketSize;
|
||||
@ -200,45 +268,64 @@ struct ei_gebp_kernel
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A1,B0,C4,T0);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B0 = ei_pload(&blB[1*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
CJMADD(A1,B0,C5,T0);
|
||||
CJMADD(A1,B0,C5,B0);
|
||||
}
|
||||
else
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1, T0, T1;
|
||||
PacketType B0, B1, B2, B3, A0, A1, T0;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
CJMADD(A1,B0,C4,T1);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,T1);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A1,B2,C6,T1); }
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T0); }
|
||||
if(nr==4) { CJMADD(A1,B3,C7,T1); }
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B2 = ei_pload(&blB[2*PacketSize]);
|
||||
CJMADD(A1,B0,C4,B0);
|
||||
B3 = ei_pload(&blB[3*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T0);
|
||||
CJMADD(A1,B1,C5,B1);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A1,B2,C6,B2);
|
||||
CJMADD(A0,B3,C3,T0);
|
||||
CJMADD(A1,B3,C7,B3);
|
||||
}
|
||||
|
||||
blB += nr*PacketSize;
|
||||
blA += mr;
|
||||
}
|
||||
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i], C0);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i], C1);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
|
||||
PacketType R0, R1, R2, R3, R4, R5, R6, R7;
|
||||
|
||||
R0 = ei_ploadu(r0);
|
||||
R1 = ei_ploadu(r1);
|
||||
if(nr==4) R2 = ei_ploadu(r2);
|
||||
if(nr==4) R3 = ei_ploadu(r3);
|
||||
R4 = ei_ploadu(r0 + PacketSize);
|
||||
R5 = ei_ploadu(r1 + PacketSize);
|
||||
if(nr==4) R6 = ei_ploadu(r2 + PacketSize);
|
||||
if(nr==4) R7 = ei_ploadu(r3 + PacketSize);
|
||||
|
||||
C0 = ei_padd(R0, C0);
|
||||
C1 = ei_padd(R1, C1);
|
||||
if(nr==4) C2 = ei_padd(R2, C2);
|
||||
if(nr==4) C3 = ei_padd(R3, C3);
|
||||
C4 = ei_padd(R4, C4);
|
||||
C5 = ei_padd(R5, C5);
|
||||
if(nr==4) C6 = ei_padd(R6, C6);
|
||||
if(nr==4) C7 = ei_padd(R7, C7);
|
||||
|
||||
ei_pstoreu(r0, C0);
|
||||
ei_pstoreu(r1, C1);
|
||||
if(nr==4) ei_pstoreu(r2, C2);
|
||||
if(nr==4) ei_pstoreu(r3, C3);
|
||||
ei_pstoreu(r0 + PacketSize, C4);
|
||||
ei_pstoreu(r1 + PacketSize, C5);
|
||||
if(nr==4) ei_pstoreu(r2 + PacketSize, C6);
|
||||
if(nr==4) ei_pstoreu(r3 + PacketSize, C7);
|
||||
}
|
||||
if(rows-peeled_mc>=PacketSize)
|
||||
{
|
||||
@ -256,81 +343,76 @@ struct ei_gebp_kernel
|
||||
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
|
||||
|
||||
// performs "inner" product
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
const Scalar* blB = unpackedB;
|
||||
for(int k=0; k<peeled_kc; k+=4)
|
||||
{
|
||||
if(nr==2)
|
||||
{
|
||||
PacketType B0, T0, A0;
|
||||
PacketType B0, B1, A0;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B0 = ei_pload(&blB[1*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
|
||||
A0 = ei_pload(&blA[1*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
B0 = ei_pload(&blB[2*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B0 = ei_pload(&blB[3*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
A0 = ei_pload(&blA[1*PacketSize]);
|
||||
B1 = ei_pload(&blB[3*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
B0 = ei_pload(&blB[4*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B0 = ei_pload(&blB[5*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
|
||||
A0 = ei_pload(&blA[3*PacketSize]);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
B1 = ei_pload(&blB[5*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
B0 = ei_pload(&blB[6*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B0 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C1,T0);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
A0 = ei_pload(&blA[3*PacketSize]);
|
||||
B1 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
PacketType B0, B1, B2, B3, A0;
|
||||
PacketType T0, T1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||
A0 = ei_pload(&blA[1*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
B2 = ei_pload(&blB[2*PacketSize]);
|
||||
B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[4*PacketSize]);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
B1 = ei_pload(&blB[5*PacketSize]);
|
||||
CJMADD(A0,B2,C2,B2);
|
||||
B2 = ei_pload(&blB[6*PacketSize]);
|
||||
CJMADD(A0,B3,C3,B3);
|
||||
A0 = ei_pload(&blA[1*PacketSize]);
|
||||
B3 = ei_pload(&blB[7*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
B0 = ei_pload(&blB[8*PacketSize]);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
B1 = ei_pload(&blB[9*PacketSize]);
|
||||
CJMADD(A0,B2,C2,B2);
|
||||
B2 = ei_pload(&blB[10*PacketSize]);
|
||||
CJMADD(A0,B3,C3,B3);
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
B3 = ei_pload(&blB[11*PacketSize]);
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||
A0 = ei_pload(&blA[3*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
B0 = ei_pload(&blB[12*PacketSize]);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
B1 = ei_pload(&blB[13*PacketSize]);
|
||||
CJMADD(A0,B2,C2,B2);
|
||||
B2 = ei_pload(&blB[14*PacketSize]);
|
||||
CJMADD(A0,B3,C3,B3);
|
||||
A0 = ei_pload(&blA[3*PacketSize]);
|
||||
B3 = ei_pload(&blB[15*PacketSize]);
|
||||
CJMADD(A0,B0,C0,B0);
|
||||
CJMADD(A0,B1,C1,B1);
|
||||
CJMADD(A0,B2,C2,B2);
|
||||
CJMADD(A0,B3,C3,B3);
|
||||
}
|
||||
|
||||
blB += 4*nr*PacketSize;
|
||||
@ -354,16 +436,16 @@ struct ei_gebp_kernel
|
||||
PacketType B0, B1, B2, B3, A0;
|
||||
PacketType T0, T1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
B2 = ei_pload(&blB[2*PacketSize]);
|
||||
B3 = ei_pload(&blB[3*PacketSize]);
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A0,B3,C3,T1);
|
||||
}
|
||||
|
||||
blB += nr*PacketSize;
|
||||
@ -384,7 +466,8 @@ struct ei_gebp_kernel
|
||||
|
||||
// gets a 1 x nr res block as registers
|
||||
Scalar C0(0), C1(0), C2(0), C3(0);
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
// TODO directly use blockB ???
|
||||
const Scalar* blB = unpackedB;//&blockB[j2*strideB+offsetB*nr];
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
if(nr==2)
|
||||
@ -402,16 +485,16 @@ struct ei_gebp_kernel
|
||||
Scalar B0, B1, B2, B3, A0;
|
||||
Scalar T0, T1;
|
||||
|
||||
A0 = blA[k];
|
||||
B0 = blB[0*PacketSize];
|
||||
B1 = blB[1*PacketSize];
|
||||
if(nr==4) B2 = blB[2*PacketSize];
|
||||
if(nr==4) B3 = blB[3*PacketSize];
|
||||
A0 = blA[k];
|
||||
B0 = blB[0*PacketSize];
|
||||
B1 = blB[1*PacketSize];
|
||||
B2 = blB[2*PacketSize];
|
||||
B3 = blB[3*PacketSize];
|
||||
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
if(nr==4) { CJMADD(A0,B2,C2,T0); }
|
||||
if(nr==4) { CJMADD(A0,B3,C3,T1); }
|
||||
CJMADD(A0,B0,C0,T0);
|
||||
CJMADD(A0,B1,C1,T1);
|
||||
CJMADD(A0,B2,C2,T0);
|
||||
CJMADD(A0,B3,C3,T1);
|
||||
}
|
||||
|
||||
blB += nr*PacketSize;
|
||||
@ -427,6 +510,13 @@ struct ei_gebp_kernel
|
||||
// => do the same but with nr==1
|
||||
for(int j2=packet_cols; j2<cols; j2++)
|
||||
{
|
||||
// unpack B
|
||||
{
|
||||
const Scalar* blB = &blockB[j2*strideB+offsetB];
|
||||
for(int k=0; k<depth; k++)
|
||||
ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
|
||||
}
|
||||
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||
@ -436,12 +526,12 @@ struct ei_gebp_kernel
|
||||
|
||||
// TODO move the res loads to the stores
|
||||
|
||||
// gets res block as register
|
||||
// get res block as registers
|
||||
PacketType C0, C4;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
|
||||
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
const Scalar* blB = unpackedB;
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
PacketType B0, A0, A1, T0, T1;
|
||||
@ -469,7 +559,7 @@ struct ei_gebp_kernel
|
||||
|
||||
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
const Scalar* blB = unpackedB;
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0);
|
||||
@ -488,7 +578,8 @@ struct ei_gebp_kernel
|
||||
|
||||
// gets a 1 x 1 res block as registers
|
||||
Scalar C0(0);
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
// FIXME directly use blockB ??
|
||||
const Scalar* blB = unpackedB;
|
||||
for(int k=0; k<depth; k++)
|
||||
C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
|
||||
res[(j2+0)*resStride + i] += C0;
|
||||
@ -552,9 +643,9 @@ struct ei_gemm_pack_lhs
|
||||
}
|
||||
};
|
||||
|
||||
// copy a complete panel of the rhs while expending each coefficient into a packet form
|
||||
// copy a complete panel of the rhs
|
||||
// this version is optimized for column major matrices
|
||||
// The traversal order is as follow (nr==4):
|
||||
// The traversal order is as follow: (nr==4):
|
||||
// 0 1 2 3 12 13 14 15 24 27
|
||||
// 4 5 6 7 16 17 18 19 25 28
|
||||
// 8 9 10 11 20 21 22 23 26 29
|
||||
@ -574,65 +665,51 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
// skip what we have before
|
||||
if(PanelMode) count += PacketSize * nr * offset;
|
||||
if(PanelMode) count += nr * offset;
|
||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
||||
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
|
||||
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
|
||||
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
|
||||
if (hasAlpha)
|
||||
{
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
blockB[count+0] = alpha*b0[k];
|
||||
blockB[count+1] = alpha*b1[k];
|
||||
if(nr==4) blockB[count+2] = alpha*b2[k];
|
||||
if(nr==4) blockB[count+3] = alpha*b3[k];
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
blockB[count+0] = b0[k];
|
||||
blockB[count+1] = b1[k];
|
||||
if(nr==4) blockB[count+2] = b2[k];
|
||||
if(nr==4) blockB[count+3] = b3[k];
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
// skip what we have after
|
||||
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
|
||||
if(PanelMode) count += nr * (stride-offset-depth);
|
||||
}
|
||||
|
||||
// copy the remaining columns one at a time (nr==1)
|
||||
for(int j2=packet_cols; j2<cols; ++j2)
|
||||
{
|
||||
if(PanelMode) count += PacketSize * offset;
|
||||
if(PanelMode) count += offset;
|
||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
||||
if (hasAlpha)
|
||||
{
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
|
||||
count += PacketSize;
|
||||
blockB[count] = alpha*b0[k];
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(b0[k]));
|
||||
count += PacketSize;
|
||||
blockB[count] = b0[k];
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
if(PanelMode) count += PacketSize * (stride-offset-depth);
|
||||
if(PanelMode) count += (stride-offset-depth);
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -652,17 +729,17 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
// skip what we have before
|
||||
if(PanelMode) count += PacketSize * nr * offset;
|
||||
if(PanelMode) count += nr * offset;
|
||||
if (hasAlpha)
|
||||
{
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
|
||||
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
|
||||
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
|
||||
count += nr*PacketSize;
|
||||
blockB[count+0] = alpha*b0[0];
|
||||
blockB[count+1] = alpha*b0[1];
|
||||
if(nr==4) blockB[count+2] = alpha*b0[2];
|
||||
if(nr==4) blockB[count+3] = alpha*b0[3];
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -670,27 +747,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
|
||||
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
|
||||
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
|
||||
count += nr*PacketSize;
|
||||
blockB[count+0] = b0[0];
|
||||
blockB[count+1] = b0[1];
|
||||
if(nr==4) blockB[count+2] = b0[2];
|
||||
if(nr==4) blockB[count+3] = b0[3];
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
// skip what we have after
|
||||
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
|
||||
if(PanelMode) count += nr * (stride-offset-depth);
|
||||
}
|
||||
// copy the remaining columns one at a time (nr==1)
|
||||
for(int j2=packet_cols; j2<cols; ++j2)
|
||||
{
|
||||
if(PanelMode) count += PacketSize * offset;
|
||||
if(PanelMode) count += offset;
|
||||
const Scalar* b0 = &rhs[j2];
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
|
||||
count += PacketSize;
|
||||
blockB[count] = alpha*b0[k*rhsStride];
|
||||
count += 1;
|
||||
}
|
||||
if(PanelMode) count += PacketSize * (stride-offset-depth);
|
||||
if(PanelMode) count += stride-offset-depth;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -78,8 +78,10 @@ static void run(int rows, int cols, int depth,
|
||||
int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc*8);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
// (==GEMM_VAR1)
|
||||
@ -111,7 +113,7 @@ static void run(int rows, int cols, int depth,
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -95,14 +95,14 @@ struct ei_symm_pack_rhs
|
||||
{
|
||||
for(int k=k2; k<end_k; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
|
||||
blockB[count+0] = alpha*rhs(k,j2+0);
|
||||
blockB[count+1] = alpha*rhs(k,j2+1);
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
|
||||
blockB[count+2] = alpha*rhs(k,j2+2);
|
||||
blockB[count+3] = alpha*rhs(k,j2+3);
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -113,14 +113,14 @@ struct ei_symm_pack_rhs
|
||||
// transpose
|
||||
for(int k=k2; k<j2; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+0,k))));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+1,k))));
|
||||
blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
|
||||
blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+2,k))));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+3,k))));
|
||||
blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
|
||||
blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
count += nr;
|
||||
}
|
||||
// symmetric
|
||||
int h = 0;
|
||||
@ -128,24 +128,24 @@ struct ei_symm_pack_rhs
|
||||
{
|
||||
// normal
|
||||
for (int w=0 ; w<h; ++w)
|
||||
ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*rhs(k,j2+w)));
|
||||
blockB[count+w] = alpha*rhs(k,j2+w);
|
||||
// transpose
|
||||
for (int w=h ; w<nr; ++w)
|
||||
ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+w,k))));
|
||||
count += nr*PacketSize;
|
||||
blockB[count+w] = alpha*ei_conj(rhs(j2+w,k));
|
||||
count += nr;
|
||||
++h;
|
||||
}
|
||||
// normal
|
||||
for(int k=j2+nr; k<end_k; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
|
||||
blockB[count+0] = alpha*rhs(k,j2+0);
|
||||
blockB[count+1] = alpha*rhs(k,j2+1);
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
|
||||
blockB[count+2] = alpha*rhs(k,j2+2);
|
||||
blockB[count+3] = alpha*rhs(k,j2+3);
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -154,14 +154,14 @@ struct ei_symm_pack_rhs
|
||||
{
|
||||
for(int k=k2; k<end_k; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+0,k))));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+1,k))));
|
||||
blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
|
||||
blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+2,k))));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+3,k))));
|
||||
blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
|
||||
blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -172,14 +172,14 @@ struct ei_symm_pack_rhs
|
||||
int half = std::min(end_k,j2);
|
||||
for(int k=k2; k<half; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*ei_conj(rhs(j2,k))));
|
||||
count += PacketSize;
|
||||
blockB[count] = alpha*ei_conj(rhs(j2,k));
|
||||
count += 1;
|
||||
}
|
||||
// normal
|
||||
for(int k=half; k<k2+rows; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2)));
|
||||
count += PacketSize;
|
||||
blockB[count] = alpha*rhs(k,j2);
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -244,7 +244,9 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
|
||||
|
||||
@ -292,7 +294,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
};
|
||||
|
||||
@ -323,7 +325,9 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
|
||||
|
||||
@ -346,7 +350,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -120,7 +120,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
// Scalar* allocatedBlockB = new Scalar[sizeB];
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
|
||||
triangularBuffer.setZero();
|
||||
@ -155,7 +158,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
|
||||
|
||||
// => GEBP with the micro triangular block
|
||||
// The trick is to pack this micro block while filling the opposite triangular part with zeros.
|
||||
// To this end we do an extra triangular copy to small temporary buffer
|
||||
// To this end we do an extra triangular copy to a small temporary buffer
|
||||
for (int k=0;k<actualPanelWidth;++k)
|
||||
{
|
||||
if (!(Mode&UnitDiag))
|
||||
@ -166,7 +169,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
|
||||
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.stride(), actualPanelWidth, actualPanelWidth);
|
||||
|
||||
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
|
||||
// GEBP with remaining micro panel
|
||||
if (lengthTarget>0)
|
||||
@ -176,7 +179,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
|
||||
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -196,7 +199,8 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
// delete[] allocatedBlockB;
|
||||
}
|
||||
};
|
||||
|
||||
@ -234,7 +238,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
|
||||
triangularBuffer.setZero();
|
||||
@ -252,7 +258,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
|
||||
const int actual_kc = std::min(IsLower ? size-k2 : k2, kc);
|
||||
int actual_k2 = IsLower ? k2 : k2-actual_kc;
|
||||
int rs = IsLower ? actual_k2 : size - k2;
|
||||
Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
|
||||
Scalar* geb = blockB+actual_kc*actual_kc/**Blocking::PacketSize*/;
|
||||
|
||||
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs);
|
||||
|
||||
@ -265,7 +271,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
|
||||
int panelOffset = IsLower ? j2+actualPanelWidth : 0;
|
||||
int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
|
||||
// general part
|
||||
pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
|
||||
pack_rhs_panel(blockB+j2*actual_kc/**Blocking::PacketSize*/,
|
||||
&rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha,
|
||||
panelLength, actualPanelWidth,
|
||||
actual_kc, panelOffset);
|
||||
@ -279,7 +285,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
|
||||
triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
|
||||
}
|
||||
|
||||
pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
|
||||
pack_rhs_panel(blockB+j2*actual_kc/**Blocking::PacketSize*/,
|
||||
triangularBuffer.data(), triangularBuffer.stride(), alpha,
|
||||
actualPanelWidth, actualPanelWidth,
|
||||
actual_kc, j2);
|
||||
@ -300,10 +306,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
|
||||
int blockOffset = IsLower ? j2 : 0;
|
||||
|
||||
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
|
||||
blockA, blockB+j2*actual_kc*Blocking::PacketSize,
|
||||
blockA, blockB+j2*actual_kc,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
actual_kc, actual_kc, // strides
|
||||
blockOffset, blockOffset*Blocking::PacketSize);// offsets
|
||||
blockOffset, blockOffset);// offsets
|
||||
}
|
||||
}
|
||||
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
|
||||
@ -312,7 +318,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -67,7 +67,9 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
|
||||
int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
ei_conj_if<Conjugate> conj;
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<Conjugate,false> > gebp_kernel;
|
||||
@ -146,7 +148,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
|
||||
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -169,7 +171,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
};
|
||||
|
||||
@ -198,7 +200,9 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
|
||||
int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
|
||||
|
||||
ei_conj_if<Conjugate> conj;
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,Conjugate> > gebp_kernel;
|
||||
@ -215,7 +219,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
|
||||
|
||||
int startPanel = IsLower ? 0 : k2+actual_kc;
|
||||
int rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
|
||||
Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
|
||||
Scalar* geb = blockB+actual_kc*actual_kc;
|
||||
|
||||
if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs);
|
||||
|
||||
@ -230,7 +234,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
|
||||
int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
|
||||
|
||||
if (panelLength>0)
|
||||
pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
&rhs(actual_k2+panelOffset, actual_j2), triStride, -1,
|
||||
panelLength, actualPanelWidth,
|
||||
actual_kc, panelOffset);
|
||||
@ -260,10 +264,10 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
|
||||
if(panelLength>0)
|
||||
{
|
||||
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
|
||||
blockA, blockB+j2*actual_kc*Blocking::PacketSize,
|
||||
blockA, blockB+j2*actual_kc,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
actual_kc, actual_kc, // strides
|
||||
panelOffset, panelOffset*Blocking::PacketSize); // offsets
|
||||
panelOffset, panelOffset); // offsets
|
||||
}
|
||||
|
||||
// unblocked triangular solve
|
||||
@ -298,7 +302,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -232,7 +232,7 @@ using Eigen::ei_cos;
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_STACK_ALLOCATION_LIMIT
|
||||
#define EIGEN_STACK_ALLOCATION_LIMIT 1000000
|
||||
#define EIGEN_STACK_ALLOCATION_LIMIT 20000
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_DEFAULT_IO_FORMAT
|
||||
|
@ -22,8 +22,8 @@ void gemm(const M& a, const M& b, M& c)
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
int rep = 2;
|
||||
int s = 1024;
|
||||
int rep = 1;
|
||||
int s = 2048;
|
||||
int m = s;
|
||||
int n = s;
|
||||
int p = s;
|
||||
|
@ -31,7 +31,22 @@ static int intone = 1;
|
||||
|
||||
void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
|
||||
{
|
||||
// cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, c.rows(), c.cols(), a.cols(), 1, a.data(), a.rows(), b.data(), b.rows(), 1, c.data(), c.rows());
|
||||
int M = c.rows();
|
||||
int N = c.cols();
|
||||
int K = a.cols();
|
||||
|
||||
int lda = a.rows();
|
||||
int ldb = b.rows();
|
||||
int ldc = c.rows();
|
||||
|
||||
sgemm_(¬rans,¬rans,&M,&N,&K,&fone,
|
||||
const_cast<float*>(a.data()),&lda,
|
||||
const_cast<float*>(b.data()),&ldb,&fone,
|
||||
c.data(),&ldc);
|
||||
}
|
||||
|
||||
void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
|
||||
{
|
||||
int M = c.rows();
|
||||
int N = c.cols();
|
||||
int K = a.cols();
|
||||
@ -40,16 +55,16 @@ void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
|
||||
int ldb = b.rows();
|
||||
int ldc = c.rows();
|
||||
|
||||
sgemm_(¬rans,¬rans,&M,&N,&K,&fone,
|
||||
const_cast<float*>(a.data()),&lda,
|
||||
const_cast<float*>(b.data()),&ldb,&fzero,
|
||||
dgemm_(¬rans,¬rans,&M,&N,&K,&done,
|
||||
const_cast<double*>(a.data()),&lda,
|
||||
const_cast<double*>(b.data()),&ldb,&done,
|
||||
c.data(),&ldc);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int rep = 2;
|
||||
int s = 1024;
|
||||
int rep = 1;
|
||||
int s = 2048;
|
||||
int m = s;
|
||||
int n = s;
|
||||
int p = s;
|
||||
|
Loading…
x
Reference in New Issue
Block a user