mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 11:49:02 +08:00
Replaced _mm_prefetch in GeneralBlockPanelKernel.h with the ei_prefetch() inline function.
Implemented NEON and AltiVec versions, and copied the SSE version over from GeneralBlockPanelKernel.h. In the GCC case (or rather, the !_MSC_VER case) the generic version is implemented using __builtin_prefetch(). NEON managed to give a small but welcome boost: 0.88 GFLOPS -> 0.91 GFLOPS.
This commit is contained in:
parent
e3e34b5920
commit
6972c140f7
@ -169,6 +169,14 @@ template<typename Scalar, typename Packet> inline void ei_pstore(Scalar* to, con
|
|||||||
template<typename Scalar, typename Packet> inline void ei_pstoreu(Scalar* to, const Packet& from)
|
template<typename Scalar, typename Packet> inline void ei_pstoreu(Scalar* to, const Packet& from)
|
||||||
{ (*to) = from; }
|
{ (*to) = from; }
|
||||||
|
|
||||||
|
/** \internal tries to do cache prefetching of \a addr
  *
  * Generic fallback used when no architecture-specific specialization
  * applies. With any compiler other than MSVC it forwards to
  * __builtin_prefetch(); with MSVC it is deliberately a no-op.
  *
  * \param addr address of the data that is about to be accessed; it is
  *             only used as a hint and is never dereferenced here.
  */
template<typename Scalar> inline void ei_prefetch(const Scalar* addr)
{
#if !defined(_MSC_VER)
  __builtin_prefetch(addr);
#else
  // No generic prefetch here: reference the parameter explicitly so the
  // empty body does not trigger an "unreferenced formal parameter" warning.
  (void)addr;
#endif
}
|
||||||
|
|
||||||
/** \internal \returns the first element of a packet */
|
/** \internal \returns the first element of a packet */
|
||||||
template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_pfirst(const Packet& a)
|
template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_pfirst(const Packet& a)
|
||||||
{ return a; }
|
{ return a; }
|
||||||
|
@ -67,6 +67,8 @@ typedef __vector unsigned char Packet16uc;
|
|||||||
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
||||||
Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
|
Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
|
||||||
|
|
||||||
|
// Value handed to vec_dstt() as its last argument by the AltiVec ei_prefetch specializations.
#define DST_CHAN 1
|
||||||
|
// Builds the control word passed to vec_dstt(): `size` is placed in bits 24 and up, `count` in bits 16 and up,
// and `stride` in the low bits. NOTE(review): presumably block size (in 16-byte vectors), block count and
// byte stride as in the AltiVec data-stream encoding -- confirm against the AltiVec PIM.
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
|
||||||
|
|
||||||
// Define global static constants:
|
// Define global static constants:
|
||||||
static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
|
static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
|
||||||
@ -291,8 +293,8 @@ template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f
|
|||||||
edgeAlign = vec_lvsl(0, to); // permute map to extract edges
|
edgeAlign = vec_lvsl(0, to); // permute map to extract edges
|
||||||
edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
|
edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
|
||||||
align = vec_lvsr( 0, to ); // permute map to misalign data
|
align = vec_lvsr( 0, to ); // permute map to misalign data
|
||||||
MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
|
MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
|
||||||
LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
|
LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
|
||||||
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
|
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
|
||||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
||||||
}
|
}
|
||||||
@ -315,6 +317,9 @@ template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i
|
|||||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal AltiVec prefetch of \a addr for float data, issued through vec_dstt() */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<float>(const float* addr)
{
  vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN);
}
|
||||||
|
/** \internal AltiVec prefetch of \a addr for int data, issued through vec_dstt() */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<int>(const int* addr)
{
  vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
|
template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
|
||||||
template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
|
template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
|
||||||
|
|
||||||
|
@ -53,6 +53,10 @@ typedef int32x4_t Packet4i;
|
|||||||
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
||||||
const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
|
const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
|
||||||
|
|
||||||
|
#ifndef __pld
|
||||||
|
// Emits an ARM `pld` instruction for the address x, passed to the asm in a register.
// The definition intentionally carries no trailing semicolon: the call site supplies it, so
// `__pld(p);` expands to exactly one statement and stays safe inside an unbraced if/else.
#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" )
|
||||||
|
#endif
|
||||||
|
|
||||||
template<> struct ei_packet_traits<float> : ei_default_packet_traits
|
template<> struct ei_packet_traits<float> : ei_default_packet_traits
|
||||||
{
|
{
|
||||||
typedef Packet4f type; enum {size=4};
|
typedef Packet4f type; enum {size=4};
|
||||||
@ -168,6 +172,9 @@ template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i
|
|||||||
template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
|
template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
|
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
|
||||||
|
|
||||||
|
/** \internal NEON prefetch of \a addr for float data, issued through the __pld macro */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<float>(const float* addr)
{
  __pld(addr);
}
|
||||||
|
/** \internal NEON prefetch of \a addr for int data, issued through the __pld macro */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<int>(const int* addr)
{
  __pld(addr);
}
|
||||||
|
|
||||||
// FIXME only store the 2 first elements ?
|
// FIXME only store the 2 first elements ?
|
||||||
template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
|
template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
|
||||||
template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
|
template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
|
||||||
|
@ -233,6 +233,10 @@ template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const Packet2
|
|||||||
template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castps_pd(from)); }
|
template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castps_pd(from)); }
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
|
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
|
||||||
|
|
||||||
|
/** \internal SSE prefetch of \a addr for float data, using _mm_prefetch with the _MM_HINT_T0 hint */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<float>(const float* addr)
{
  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
}
|
||||||
|
/** \internal SSE prefetch of \a addr for double data, using _mm_prefetch with the _MM_HINT_T0 hint */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<double>(const double* addr)
{
  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
}
|
||||||
|
/** \internal SSE prefetch of \a addr for int data, using _mm_prefetch with the _MM_HINT_T0 hint */
template<>
EIGEN_STRONG_INLINE void ei_prefetch<int>(const int* addr)
{
  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
}
|
||||||
|
|
||||||
#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
|
#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
|
||||||
// The temporary variable fixes an internal compilation error.
|
// The temporary variable fixes an internal compilation error.
|
||||||
// Direct access to the struct members fixed bug #62.
|
// Direct access to the struct members fixed bug #62.
|
||||||
|
@ -117,9 +117,7 @@ struct ei_gebp_kernel
|
|||||||
for(int i=0; i<peeled_mc; i+=mr)
|
for(int i=0; i<peeled_mc; i+=mr)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(&blA[0]);
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// TODO move the res loads to the stores
|
// TODO move the res loads to the stores
|
||||||
|
|
||||||
@ -139,12 +137,10 @@ struct ei_gebp_kernel
|
|||||||
Scalar* r2 = r1 + resStride;
|
Scalar* r2 = r1 + resStride;
|
||||||
Scalar* r3 = r2 + resStride;
|
Scalar* r3 = r2 + resStride;
|
||||||
|
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(r0+16);
|
||||||
_mm_prefetch((const char*)(r0+16), _MM_HINT_T0);
|
ei_prefetch(r1+16);
|
||||||
_mm_prefetch((const char*)(r1+16), _MM_HINT_T0);
|
ei_prefetch(r2+16);
|
||||||
_mm_prefetch((const char*)(r2+16), _MM_HINT_T0);
|
ei_prefetch(r3+16);
|
||||||
_mm_prefetch((const char*)(r3+16), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// performs "inner" product
|
// performs "inner" product
|
||||||
// TODO let's check whether the following peeled loop could not be
|
// TODO let's check whether the following peeled loop could not be
|
||||||
@ -334,9 +330,7 @@ struct ei_gebp_kernel
|
|||||||
{
|
{
|
||||||
int i = peeled_mc;
|
int i = peeled_mc;
|
||||||
const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
|
const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(&blA[0]);
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// gets res block as register
|
// gets res block as register
|
||||||
PacketType C0, C1, C2, C3;
|
PacketType C0, C1, C2, C3;
|
||||||
@ -464,9 +458,7 @@ struct ei_gebp_kernel
|
|||||||
for(int i=peeled_mc2; i<rows; i++)
|
for(int i=peeled_mc2; i<rows; i++)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA+offsetA];
|
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(&blA[0]);
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// gets a 1 x nr res block as registers
|
// gets a 1 x nr res block as registers
|
||||||
Scalar C0(0), C1(0), C2(0), C3(0);
|
Scalar C0(0), C1(0), C2(0), C3(0);
|
||||||
@ -524,9 +516,7 @@ struct ei_gebp_kernel
|
|||||||
for(int i=0; i<peeled_mc; i+=mr)
|
for(int i=0; i<peeled_mc; i+=mr)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(&blA[0]);
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// TODO move the res loads to the stores
|
// TODO move the res loads to the stores
|
||||||
|
|
||||||
@ -557,9 +547,7 @@ struct ei_gebp_kernel
|
|||||||
{
|
{
|
||||||
int i = peeled_mc;
|
int i = peeled_mc;
|
||||||
const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
|
const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(&blA[0]);
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||||
|
|
||||||
@ -576,9 +564,7 @@ struct ei_gebp_kernel
|
|||||||
for(int i=peeled_mc2; i<rows; i++)
|
for(int i=peeled_mc2; i<rows; i++)
|
||||||
{
|
{
|
||||||
const Scalar* blA = &blockA[i*strideA+offsetA];
|
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||||
#ifdef EIGEN_VECTORIZE_SSE
|
ei_prefetch(&blA[0]);
|
||||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// gets a 1 x 1 res block as registers
|
// gets a 1 x 1 res block as registers
|
||||||
Scalar C0(0);
|
Scalar C0(0);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user