bug #955 - Implement a rotating kernel alternative in the 3px4 gebp path

This is substantially faster on ARM, where it's important to minimize the number of loads.

This is specific to the case where all packet types are of size 4. I made my best attempt to minimize how dirty this is... opinions welcome.

Eventually one could have a generic rotated kernel, but it would take some work to get there. Also, on Sandy Bridge, in my experience, it's not beneficial (it's even about 1% slower).
Benoit Jacob 2015-02-18 15:03:35 -05:00
parent ee27d50633
commit 9bd8a4bab5
5 changed files with 133 additions and 19 deletions
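To make the idea concrete, here is a rough sketch of one k-step of the inner loop (plain C++; Packet4, load4, bcast, rotate1, madd, step_baseline and step_rotating are made-up stand-ins, not Eigen code; the real kernel works on three LhsPacket rows through the gebp traits):

struct Packet4 { float v[4]; };                    // stand-in for a 4-wide packet

static Packet4 load4(const float* p) { return {p[0], p[1], p[2], p[3]}; }
static Packet4 bcast(float x)        { return {x, x, x, x}; }
static Packet4 rotate1(const Packet4& a) { return {a.v[1], a.v[2], a.v[3], a.v[0]}; }
static void madd(const Packet4& a, const Packet4& b, Packet4& c)
{ for (int i = 0; i < 4; ++i) c.v[i] += a.v[i] * b.v[i]; }

// Baseline: four rhs loads (broadcasts) per k-step.
void step_baseline(const float* blA, const float* blB, Packet4 C[4]) {
  Packet4 A0 = load4(blA);
  for (int n = 0; n < 4; ++n) madd(A0, bcast(blB[n]), C[n]);
}

// Rotating variant: one rhs load per k-step, then in-register rotations.
// The accumulators come out permuted and are fixed up once per micro-tile
// (see EIGEN_GEBP_UNROTATE_RESULT in the diff below).
void step_rotating(const float* blA, const float* blB, Packet4 C[4]) {
  Packet4 A0 = load4(blA), B = load4(blB);
  for (int n = 0; n < 4; ++n) { madd(A0, B, C[n]); B = rotate1(B); }
}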


@@ -287,6 +287,21 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
{ return a; }
template<size_t offset, typename Packet>
struct protate_impl
{
static Packet run(const Packet& a) { return a; }
};
/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
* by the given offset, e.g. for offset == 1:
* (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
*/
template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
{
EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
return offset ? protate_impl<offset, Packet>::run(a) : a;
}
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)

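For illustration (not part of the patch), here is what the new primitive does on a concrete packet, assuming an SSE build where Packet4f is __m128 and using names from Eigen::internal:

// lanes listed low-to-high: p = (0, 1, 2, 3)
Packet4f p = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
Packet4f r = protate<1>(p);          // lanes become (1, 2, 3, 0)
float buf[4];
pstoreu(buf, r);                     // buf == {1, 2, 3, 0}
// In the little-endian notation of the comment: (3,2,1,0) -> (0,3,2,1).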

@@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
a_hi = vget_high_s32(a_r64);
return vcombine_s32(a_hi, a_lo);
}
template<size_t offset>
struct protate_impl<offset, Packet4f>
{
static Packet4f run(const Packet4f& a) {
return vextq_f32(a, a, offset);
}
};
template<size_t offset>
struct protate_impl<offset, Packet4i>
{
static Packet4i run(const Packet4i& a) {
return vextq_s32(a, a, offset);
}
};
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
@@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
template<size_t offset>
struct protate_impl<offset, Packet2d>
{
static Packet2d run(const Packet2d& a) {
return vextq_f64(a, a, offset);
}
};
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)


@@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }
template<size_t offset>
struct protate_impl<offset, Packet4f>
{
static Packet4f run(const Packet4f& a) {
return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
}
};
template<size_t offset>
struct protate_impl<offset, Packet4i>
{
static Packet4i run(const Packet4i& a) {
return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
}
};
template<size_t offset>
struct protate_impl<offset, Packet2d>
{
static Packet2d run(const Packet2d& a) {
return vec2d_swizzle1(a, offset, (offset + 1) % 2);
}
};
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{


@@ -771,7 +771,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
const Index peeled_kc = depth & ~(pk-1);
const Index prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
#if EIGEN_ARCH_ARM
const bool PreferRotatingKernel = true;
#else
const bool PreferRotatingKernel = false;
#endif
const bool UseRotatingKernel =
PreferRotatingKernel &&
Traits::LhsPacketSize == 4 &&
Traits::RhsPacketSize == 4 &&
Traits::ResPacketSize == 4;
//---------- Process 3 * LhsProgress rows at once ----------
// This corresponds to 3*LhsProgress x nr register blocks.
// Usually, only makes sense with FMA
@@ -818,7 +830,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
RhsPacket B_0, T0;
LhsPacket A2;
#define EIGEN_GEBGP_ONESTEP(K) \
#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
do { \
if (UseRotatingKernel) { \
if (N == 0) { \
B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
} else { \
EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
B_0 = protate<1>(B_0); \
} \
} else { \
traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \
} \
} while (false)
#define EIGEN_GEBP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
@@ -827,34 +853,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, T0); \
traits.madd(A2, B_0, C8, B_0); \
traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
traits.madd(A0, B_0, C1, T0); \
traits.madd(A1, B_0, C5, T0); \
traits.madd(A2, B_0, C9, B_0); \
traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
traits.madd(A0, B_0, C2, T0); \
traits.madd(A1, B_0, C6, T0); \
traits.madd(A2, B_0, C10, B_0); \
traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
traits.madd(A0, B_0, C3 , T0); \
traits.madd(A1, B_0, C7, T0); \
traits.madd(A2, B_0, C11, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
} while(false)
internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
EIGEN_GEBGP_ONESTEP(3);
EIGEN_GEBGP_ONESTEP(4);
EIGEN_GEBGP_ONESTEP(5);
EIGEN_GEBGP_ONESTEP(6);
EIGEN_GEBGP_ONESTEP(7);
EIGEN_GEBP_ONESTEP(0);
EIGEN_GEBP_ONESTEP(1);
EIGEN_GEBP_ONESTEP(2);
EIGEN_GEBP_ONESTEP(3);
EIGEN_GEBP_ONESTEP(4);
EIGEN_GEBP_ONESTEP(5);
EIGEN_GEBP_ONESTEP(6);
EIGEN_GEBP_ONESTEP(7);
blB += pk*4*RhsProgress;
blA += pk*3*Traits::LhsProgress;
@@ -866,12 +892,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
{
RhsPacket B_0, T0;
LhsPacket A2;
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBP_ONESTEP(0);
blB += 4*RhsProgress;
blA += 3*Traits::LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
#undef EIGEN_GEBP_ONESTEP
if (UseRotatingKernel) {
#define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
do { \
PacketBlock<ResPacket> resblock; \
resblock.packet[0] = res0; \
resblock.packet[1] = res1; \
resblock.packet[2] = res2; \
resblock.packet[3] = res3; \
ptranspose(resblock); \
resblock.packet[3] = protate<1>(resblock.packet[3]); \
resblock.packet[2] = protate<2>(resblock.packet[2]); \
resblock.packet[1] = protate<3>(resblock.packet[1]); \
ptranspose(resblock); \
res0 = resblock.packet[0]; \
res1 = resblock.packet[1]; \
res2 = resblock.packet[2]; \
res3 = resblock.packet[3]; \
} while (false)
EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
}
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);

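For readers puzzled by the fix-up above: with the rotating kernel, lane i of accumulator C_N holds a[i]*b[(i+N)%4] instead of a[i]*b[N], and the transpose/rotate/transpose sequence undoes exactly that permutation. A standalone illustration with plain arrays (not Eigen code; a and b stand for one lhs packet and one row of four rhs coefficients):

#include <cstdio>

int main() {
  const float a[4] = {1, 2, 3, 4};     // one lhs packet
  const float b[4] = {10, 20, 30, 40}; // one row of 4 rhs coefficients

  // What the rotating kernel accumulates: lane i of column N sees the
  // rhs packet rotated N times, i.e. C[N][i] = a[i] * b[(i + N) % 4].
  float C[4][4];
  for (int N = 0; N < 4; ++N)
    for (int i = 0; i < 4; ++i)
      C[N][i] = a[i] * b[(i + N) % 4];

  // EIGEN_GEBP_UNROTATE_RESULT: transpose, rotate row r by (4 - r) % 4 lanes
  // (row 0 untouched, rows 1..3 via protate<3>, protate<2>, protate<1>),
  // then transpose back.
  float T[4][4], out[4][4];
  for (int i = 0; i < 4; ++i)
    for (int N = 0; N < 4; ++N)
      T[i][N] = C[N][i];
  for (int r = 0; r < 4; ++r)
    for (int m = 0; m < 4; ++m)
      out[m][r] = T[r][(m + 4 - r) % 4];   // second transpose folded in

  // out[N][i] now equals a[i] * b[N], as the non-rotating kernel would give.
  bool ok = true;
  for (int N = 0; N < 4; ++N)
    for (int i = 0; i < 4; ++i)
      ok = ok && out[N][i] == a[i] * b[N];
  std::printf("%s\n", ok ? "fix-up recovers the plain result" : "mismatch");
  return 0;
}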

@@ -93,7 +93,8 @@
THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
STORAGE_LAYOUT_DOES_NOT_MATCH
STORAGE_LAYOUT_DOES_NOT_MATCH,
ROTATION_BY_ILLEGAL_OFFSET
};
};