mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-19 19:34:29 +08:00
WIP 2
This commit is contained in:
parent
a8ec6d6a36
commit
9b8cdceea8
@ -92,6 +92,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
||||
_acc.packet[2] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// No-op: this overload ignores all three arguments and issues no prefetch.
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
@ -127,6 +129,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
||||
_acc.packet[1] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// No-op: this overload ignores all three arguments and issues no prefetch.
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
@ -157,6 +161,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
||||
{
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// No-op: this overload ignores all three arguments and issues no prefetch.
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
@ -186,6 +192,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// No-op: this overload ignores all three arguments and issues no prefetch.
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
@ -216,6 +224,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
_acc.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// Requests a prefetch at offset 0 of the linear mapper obtained for each of
// the four consecutive destination columns col .. col+3 at the given row.
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
{
  for(Index j = 0; j < 4; ++j)
  {
    dest.getLinearMapper(row, col + j).prefetch(0);
  }
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
@ -227,15 +243,17 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
|
||||
// Adds the accumulated packets to the destination tile.
//
// dest     : mapper over the result matrix.
// row, col : position of the tile inside dest.
//
// Column col + j is updated with _acc.packet[j] through a read-modify-write of
// one ResPacket at offset 0*PacketSize of that column's linear mapper.
// Each accumulator must be applied exactly once: offset 0 and 0*PacketSize are
// the same address, so a second read-modify-write per column would add the
// accumulator to the destination twice.
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
  constexpr auto PacketSize = unpacket_traits<ResPacket>::size;

  LinearMapper r0 = dest.getLinearMapper(row, col + 0);
  LinearMapper r1 = dest.getLinearMapper(row, col + 1);
  LinearMapper r2 = dest.getLinearMapper(row, col + 2);
  LinearMapper r3 = dest.getLinearMapper(row, col + 3);

  r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[0]);
  r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[1]);
  r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[2]);
  r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[3]);
}
|
||||
};
|
||||
|
||||
@ -262,6 +280,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// Requests a prefetch at offset 0 of the linear mapper obtained for each of
// the four consecutive destination columns col .. col+3, starting at `row`.
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
{
  for(Index j = 0; j < 4; ++j)
  {
    dest.getLinearMapper(row, col + j).prefetch(0);
  }
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
@ -278,26 +304,22 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||
|
||||
// Adds the two accumulator blocks to the destination tile.
//
// dest     : mapper over the result matrix.
// row, col : position of the tile inside dest.
//
// For each column col + j, _acc1.packet[j] is added at offset 0*PacketSize and
// _acc2.packet[j] at offset 1*PacketSize of that column's linear mapper.
// Every accumulator is applied exactly once: a second set of mappers anchored
// at row + 0 / row + 4 would repeat the same read-modify-writes and add each
// accumulator to the destination twice.
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
  constexpr auto PacketSize = unpacket_traits<ResPacket>::size;

  LinearMapper r0 = dest.getLinearMapper(row, col + 0);
  LinearMapper r1 = dest.getLinearMapper(row, col + 1);
  LinearMapper r2 = dest.getLinearMapper(row, col + 2);
  LinearMapper r3 = dest.getLinearMapper(row, col + 3);

  r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
  r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
  r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
  r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);

  r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
  r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
  r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
  r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
}
|
||||
};
|
||||
|
||||
@ -330,6 +352,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||
_acc3.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
// Requests a prefetch at offset 0 of the linear mapper obtained for each of
// the four consecutive destination columns col .. col+3, starting at `row`.
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
{
  for(Index j = 0; j < 4; ++j)
  {
    dest.getLinearMapper(row, col + j).prefetch(0);
  }
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
@ -351,94 +381,139 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||
|
||||
// Adds the three accumulator blocks to the destination tile.
//
// dest     : mapper over the result matrix.
// row, col : position of the tile inside dest.
//
// For each column col + j, _acc1.packet[j], _acc2.packet[j] and _acc3.packet[j]
// are added at offsets 0, 1 and 2 times PacketSize of that column's linear
// mapper, so the three blocks land in consecutive packet-sized row groups.
// Every accumulator is applied exactly once. A parallel set of mappers anchored
// at row + 0 / row + 4 / row + 12 must not be used alongside these: it would
// add _acc1 and _acc2 a second time and place _acc3 at a row offset that is
// not contiguous with the other two blocks.
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
  constexpr auto PacketSize = unpacket_traits<ResPacket>::size;

  LinearMapper r0 = dest.getLinearMapper(row, col + 0);
  LinearMapper r1 = dest.getLinearMapper(row, col + 1);
  LinearMapper r2 = dest.getLinearMapper(row, col + 2);
  LinearMapper r3 = dest.getLinearMapper(row, col + 3);

  r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
  r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
  r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
  r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);

  r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
  r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
  r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
  r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);

  r0.storePacket(2*PacketSize, r0.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[0]);
  r1.storePacket(2*PacketSize, r1.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[1]);
  r2.storePacket(2*PacketSize, r2.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[2]);
  r3.storePacket(2*PacketSize, r3.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[3]);
}
|
||||
};
|
||||
|
||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
|
||||
// {
|
||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
// RhsPackMap& rhsPackMap,
|
||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||
// Accumulator& acc)
|
||||
// {
|
||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
|
||||
// LhsPacket pLhs, pLhs2;
|
||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
#if __UNROLL__ > 4
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
#endif
|
||||
|
||||
// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
||||
// };
|
||||
// };
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
|
||||
// {
|
||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
// RhsPackMap& rhsPackMap,
|
||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||
// Accumulator& acc)
|
||||
// {
|
||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
||||
|
||||
// LhsPacket pLhs, pLhs2;
|
||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
LhsPacket pLhs, pLhs2;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
// prefetch(lhsPackMap.pCur + 2*32);
|
||||
// prefetch(rhsPackMap.pCur + 2*16);
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
// MICRO_8x1x4();
|
||||
#if __UNROLL__ == 8
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur + (48+0));
|
||||
#endif
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur + (48+16));
|
||||
#endif
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#else
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#endif
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
// asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
|
||||
// };
|
||||
// };
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t");
|
||||
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
#if __UNROLL__ == 8
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur);
|
||||
#endif
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
#else
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
#endif
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
|
||||
@ -451,14 +526,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t");
|
||||
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_12x1x4();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
@ -484,58 +559,6 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
};
|
||||
};
|
||||
|
||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
|
||||
// {
|
||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
// RhsPackMap& rhsPackMap,
|
||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||
// Accumulator& acc)
|
||||
// {
|
||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
// LhsPacket pLhs;
|
||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
|
||||
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
// };
|
||||
// };
|
||||
|
||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
|
||||
// {
|
||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
// RhsPackMap& rhsPackMap,
|
||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||
// Accumulator& acc)
|
||||
// {
|
||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
// LhsPacket pLhs;
|
||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
// MICRO_4x1x4();
|
||||
|
||||
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
// };
|
||||
// };
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
|
||||
{
|
||||
|
@ -18,8 +18,12 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Default value for __UNROLL__: 8 unless the macro is already defined when this
// point is reached (for example through -D__UNROLL__=4 on the command line).
// NOTE(review): identifiers containing a double underscore are reserved for the
// implementation in C++; consider a project-prefixed macro name.
#ifndef __UNROLL__
|
||||
#define __UNROLL__ 8
|
||||
#endif
|
||||
|
||||
// Number of rows in the SHAPES table (entries 00 through 10).
// Defined exactly once: a second definition with a different value is a
// redefinition error and would disagree with the table's row count.
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT = 11;
|
||||
|
||||
constexpr int SHAPES_DIMENSION = 6;
|
||||
constexpr int SHAPES_LHS_DIMENSION = 0;
|
||||
@ -43,14 +47,17 @@ constexpr int PACK_SHAPES_END = -1;
|
||||
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||
{ /* 0 */{ 1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 1 */{1*packet_traits<RhsScalar>::size,1,1, 0, 0, SHAPES_POINTER_END},
|
||||
/* 2 */{2*packet_traits<RhsScalar>::size,1,1, 0, 1, SHAPES_POINTER_END},
|
||||
/* 3 */{3*packet_traits<RhsScalar>::size,1,1, 0, 2, SHAPES_POINTER_END},
|
||||
/* 4 */{ 1,1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 5 */{1*packet_traits<RhsScalar>::size,1,4, 3, 4, SHAPES_POINTER_END},
|
||||
/* 6 */{2*packet_traits<RhsScalar>::size,1,4, 3, 5, SHAPES_POINTER_END},
|
||||
/* 7 */{3*packet_traits<RhsScalar>::size,1,4, 3, 6, SHAPES_POINTER_END}};
|
||||
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||
/* 02 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
|
||||
/* 03 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
||||
/* 04 */{ 1, 1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 05 */{1*packet_traits<RhsScalar>::size, 1,4, 3, 4, SHAPES_POINTER_END},
|
||||
/* 06 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 4, SHAPES_POINTER_END},
|
||||
/* 07 */{2*packet_traits<RhsScalar>::size, 1,4, 3, 6, SHAPES_POINTER_END},
|
||||
/* 08 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 6, 7},
|
||||
/* 09 */{3*packet_traits<RhsScalar>::size, 1,4, 3, 8, SHAPES_POINTER_END},
|
||||
/* 10 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 8, 9}};
|
||||
|
||||
// d1progress x d2progress
|
||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||
@ -225,6 +232,8 @@ struct Accumulator
|
||||
}
|
||||
}
|
||||
|
||||
// No-op: this overload ignores all three arguments and issues no prefetch.
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
|
||||
{
|
||||
@ -305,11 +314,21 @@ struct DepthLoopStruct
|
||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
||||
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(lhsPackMap.pCur);
|
||||
prefetch(rhsPackMap.pCur);
|
||||
#endif
|
||||
|
||||
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||
|
||||
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
|
||||
AccumulatorType acc;
|
||||
acc.zero();
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
acc.prefetch(res, rowIdx, colIdx);
|
||||
#endif
|
||||
|
||||
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
||||
{
|
||||
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
|
||||
|
@ -1,5 +1,5 @@
|
||||
#!/bin/bash
# Builds the two benchmark binaries from new_gemm_test.cpp:
#   gto - compiled against ../eigen-master
#   gt  - compiled against the current tree with the vector kernels and
#         prefetching enabled
# Each binary is built once; building gt twice only discards the first result.

# Stop at the first failed compile so a stale binary is never left in place
# looking like a fresh build.
set -e

echo 'Compiling with master'
g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto

echo 'Compiling current'
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gt
|
@ -15,10 +15,11 @@ void set(MatrixXf& A, int m, int n, int id, int digits)
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n);
|
||||
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
|
||||
MatrixXf A = MatrixXf::Zero(m, k);
|
||||
MatrixXf B = MatrixXf::Zero(k, n);
|
||||
MatrixXf C = MatrixXf::Zero(m, n);
|
||||
MatrixXf D = MatrixXf::Zero(m, n);
|
||||
|
||||
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
||||
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
||||
@ -40,17 +41,25 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
acc += A(i,kk)*B(kk,j);
|
||||
}
|
||||
C(i,j) = acc;
|
||||
//std::cout << acc << " ";
|
||||
D(i,j) = acc;
|
||||
if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
|
||||
{
|
||||
std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
|
||||
}
|
||||
}
|
||||
//std::cout << std::endl;
|
||||
}
|
||||
|
||||
std::cout << C << std::endl;
|
||||
#else
|
||||
int sz = 128;
|
||||
if(argc < 3)
|
||||
{
|
||||
std::cout << "Wrong number of arguments." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sz = std::atoi(argv[1]);
|
||||
int m = sz, k = sz, n = sz;
|
||||
int RUNS = 500;
|
||||
int RUNS = std::atoi(argv[2]);
|
||||
double time = 0;
|
||||
|
||||
for(auto i = 0; i < RUNS; i++)
|
||||
|
63
run.sh
63
run.sh
@ -1,32 +1,33 @@
|
||||
#!/bin/bash
# Benchmark driver comparing ./gto (master build) with ./gt (current build).
#
# Usage: run.sh EXECS
#   EXECS - positive number of master/current pairs timed per matrix size.
#
# Both binaries are always invoked as "<binary> SIZE RUNS"; the test program
# rejects invocations with fewer arguments, so no argument-less calls are made.

if [[ $# -lt 1 || ! $1 =~ ^[1-9][0-9]*$ ]]; then
    echo "Usage: $0 EXECS   (EXECS must be a positive integer)" >&2
    exit 1
fi

# run EXECS SIZE RUNS
# Times EXECS pairs of "./gto SIZE RUNS" and "./gt SIZE RUNS" and prints
# "SIZE -> <sum of master outputs> / <sum of current outputs>".
# Which binary goes first in a pair is decided by a fair coin flip so that
# neither build is systematically favoured by ordering effects.
function run() {
    local execs=$1
    local size=$2
    local runs=$3
    local old=0
    local new=0
    local t_old t_new speed i

    for ((i = 0; i < execs; i++)); do
        # RANDOM covers an even number of values, so "% 2" is unbiased.
        if (( RANDOM % 2 == 0 )); then
            t_old=$(./gto "$size" "$runs")
            t_new=$(./gt "$size" "$runs")
        else
            t_new=$(./gt "$size" "$runs")
            t_old=$(./gto "$size" "$runs")
        fi
        # Build up "0+a+b+..." expressions; bc evaluates them below.
        old="$old+$t_old"
        new="$new+$t_new"
    done

    speed=$(echo "($old) / ($new)" | bc -l)
    echo "$size -> $speed"
}

run "$1" 16 500
run "$1" 32 500
run "$1" 64 500
run "$1" 128 100
run "$1" 256 100
|
||||
|
Loading…
x
Reference in New Issue
Block a user