mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-21 12:24:25 +08:00
WIP 2
This commit is contained in:
parent
a8ec6d6a36
commit
9b8cdceea8
@ -92,6 +92,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
|||||||
_acc.packet[2] = pset1<AccPacket>(0);
|
_acc.packet[2] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -127,6 +129,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
|||||||
_acc.packet[1] = pset1<AccPacket>(0);
|
_acc.packet[1] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -158,6 +162,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
|||||||
_acc = pset1<AccPacket>(0);
|
_acc = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -186,6 +192,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
|||||||
_acc = pset1<AccPacket>(0);
|
_acc = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -216,6 +224,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
|||||||
_acc.packet[3] = pset1<AccPacket>(0);
|
_acc.packet[3] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
|
{
|
||||||
|
dest.getLinearMapper(row, col + 0).prefetch(0);
|
||||||
|
dest.getLinearMapper(row, col + 1).prefetch(0);
|
||||||
|
dest.getLinearMapper(row, col + 2).prefetch(0);
|
||||||
|
dest.getLinearMapper(row, col + 3).prefetch(0);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -227,15 +243,17 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
{
|
||||||
|
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||||
|
|
||||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||||
|
|
||||||
r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc.packet[0]);
|
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[0]);
|
||||||
r1.storePacket(0, r1.template loadPacket<ResPacket>(0) + _acc.packet[1]);
|
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[1]);
|
||||||
r2.storePacket(0, r2.template loadPacket<ResPacket>(0) + _acc.packet[2]);
|
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[2]);
|
||||||
r3.storePacket(0, r3.template loadPacket<ResPacket>(0) + _acc.packet[3]);
|
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[3]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -262,6 +280,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
|||||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
|
{
|
||||||
|
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
|
||||||
|
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
|
||||||
|
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
|
||||||
|
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -278,26 +304,22 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
{
|
||||||
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
|
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||||
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
|
|
||||||
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
|
|
||||||
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
|
|
||||||
|
|
||||||
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
|
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||||
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
|
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||||
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
|
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||||
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
|
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||||
|
|
||||||
|
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
|
||||||
|
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
|
||||||
|
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
|
||||||
|
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
|
||||||
|
|
||||||
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
|
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
|
||||||
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
|
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
|
||||||
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
|
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
|
||||||
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
|
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
|
||||||
|
|
||||||
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
|
|
||||||
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
|
|
||||||
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
|
|
||||||
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -330,6 +352,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
|||||||
_acc3.packet[3] = pset1<AccPacket>(0);
|
_acc3.packet[3] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
|
{
|
||||||
|
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
|
||||||
|
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
|
||||||
|
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
|
||||||
|
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
{
|
||||||
@ -351,94 +381,139 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
{
|
||||||
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
|
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||||
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
|
|
||||||
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
|
|
||||||
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
|
|
||||||
|
|
||||||
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
|
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||||
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
|
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||||
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
|
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||||
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
|
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||||
|
|
||||||
LinearMapper r20 = dest.getLinearMapper(row + 12, col + 0);
|
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
|
||||||
LinearMapper r21 = dest.getLinearMapper(row + 12, col + 1);
|
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
|
||||||
LinearMapper r22 = dest.getLinearMapper(row + 12, col + 2);
|
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
|
||||||
LinearMapper r23 = dest.getLinearMapper(row + 12, col + 3);
|
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
|
||||||
|
|
||||||
|
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
|
||||||
|
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
|
||||||
|
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
|
||||||
|
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
|
||||||
|
|
||||||
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
|
r0.storePacket(2*PacketSize, r0.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[0]);
|
||||||
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
|
r1.storePacket(2*PacketSize, r1.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[1]);
|
||||||
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
|
r2.storePacket(2*PacketSize, r2.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[2]);
|
||||||
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
|
r3.storePacket(2*PacketSize, r3.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[3]);
|
||||||
|
|
||||||
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
|
|
||||||
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
|
|
||||||
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
|
|
||||||
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
|
|
||||||
|
|
||||||
r20.storePacket(0, r20.template loadPacket<ResPacket>(0) + _acc3.packet[0]);
|
|
||||||
r21.storePacket(0, r21.template loadPacket<ResPacket>(0) + _acc3.packet[1]);
|
|
||||||
r22.storePacket(0, r22.template loadPacket<ResPacket>(0) + _acc3.packet[2]);
|
|
||||||
r23.storePacket(0, r23.template loadPacket<ResPacket>(0) + _acc3.packet[3]);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4>
|
||||||
// {
|
{
|
||||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
// RhsPackMap& rhsPackMap,
|
RhsPackMap& rhsPackMap,
|
||||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
// Accumulator& acc)
|
Accumulator& acc)
|
||||||
// {
|
{
|
||||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
|
|
||||||
// LhsPacket pLhs, pLhs2;
|
LhsPacket pLhs;
|
||||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
// MICRO_8x1x4();
|
#if __UNROLL__ > 4
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
// MICRO_8x1x4();
|
MICRO_4x1x4();
|
||||||
|
MICRO_4x1x4();
|
||||||
|
#endif
|
||||||
|
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
// };
|
};
|
||||||
// };
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4>
|
||||||
// {
|
{
|
||||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
// RhsPackMap& rhsPackMap,
|
RhsPackMap& rhsPackMap,
|
||||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
// Accumulator& acc)
|
Accumulator& acc)
|
||||||
// {
|
{
|
||||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
||||||
|
|
||||||
// LhsPacket pLhs, pLhs2;
|
LhsPacket pLhs, pLhs2;
|
||||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
// prefetch(lhsPackMap.pCur + 2*32);
|
#if __UNROLL__ == 8
|
||||||
// prefetch(rhsPackMap.pCur + 2*16);
|
#ifdef __ENABLE_PREFETCH__
|
||||||
// MICRO_8x1x4();
|
prefetch(rhsPackMap.pCur + (48+0));
|
||||||
// MICRO_8x1x4();
|
#endif
|
||||||
// MICRO_8x1x4();
|
MICRO_8x1x4();
|
||||||
// MICRO_8x1x4();
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
#ifdef __ENABLE_PREFETCH__
|
||||||
|
prefetch(rhsPackMap.pCur + (48+16));
|
||||||
|
#endif
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
#else
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
MICRO_8x1x4();
|
||||||
|
#endif
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// };
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4>
|
||||||
// };
|
{
|
||||||
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
RhsPackMap& rhsPackMap,
|
||||||
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
Accumulator& acc)
|
||||||
|
{
|
||||||
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t");
|
||||||
|
|
||||||
|
LhsPacket pLhs, pLhs2, pLhs3;
|
||||||
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
|
#if __UNROLL__ == 8
|
||||||
|
#ifdef __ENABLE_PREFETCH__
|
||||||
|
prefetch(rhsPackMap.pCur);
|
||||||
|
#endif
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
#else
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
MICRO_12x1x4();
|
||||||
|
#endif
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
|
||||||
@ -451,14 +526,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t");
|
||||||
|
|
||||||
LhsPacket pLhs, pLhs2, pLhs3;
|
LhsPacket pLhs, pLhs2, pLhs3;
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
MICRO_12x1x4();
|
MICRO_12x1x4();
|
||||||
|
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -484,58 +559,6 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
|
|
||||||
// {
|
|
||||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
|
||||||
// RhsPackMap& rhsPackMap,
|
|
||||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
|
||||||
// Accumulator& acc)
|
|
||||||
// {
|
|
||||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
|
||||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
|
||||||
|
|
||||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
|
||||||
// LhsPacket pLhs;
|
|
||||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
|
||||||
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
|
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
|
||||||
// };
|
|
||||||
// };
|
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
|
|
||||||
// {
|
|
||||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
|
||||||
// RhsPackMap& rhsPackMap,
|
|
||||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
|
||||||
// Accumulator& acc)
|
|
||||||
// {
|
|
||||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
|
||||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
|
||||||
|
|
||||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
|
||||||
// LhsPacket pLhs;
|
|
||||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
|
||||||
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
// MICRO_4x1x4();
|
|
||||||
|
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
|
||||||
// };
|
|
||||||
// };
|
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
|
||||||
{
|
{
|
||||||
|
@ -18,8 +18,12 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
|
#ifndef __UNROLL__
|
||||||
|
#define __UNROLL__ 8
|
||||||
|
#endif
|
||||||
|
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES_COUNT = 8;
|
constexpr int SHAPES_COUNT = 11;
|
||||||
|
|
||||||
constexpr int SHAPES_DIMENSION = 6;
|
constexpr int SHAPES_DIMENSION = 6;
|
||||||
constexpr int SHAPES_LHS_DIMENSION = 0;
|
constexpr int SHAPES_LHS_DIMENSION = 0;
|
||||||
@ -43,14 +47,17 @@ constexpr int PACK_SHAPES_END = -1;
|
|||||||
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||||
{ /* 0 */{ 1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/* 1 */{1*packet_traits<RhsScalar>::size,1,1, 0, 0, SHAPES_POINTER_END},
|
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||||
/* 2 */{2*packet_traits<RhsScalar>::size,1,1, 0, 1, SHAPES_POINTER_END},
|
/* 02 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
|
||||||
/* 3 */{3*packet_traits<RhsScalar>::size,1,1, 0, 2, SHAPES_POINTER_END},
|
/* 03 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
||||||
/* 4 */{ 1,1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
/* 04 */{ 1, 1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/* 5 */{1*packet_traits<RhsScalar>::size,1,4, 3, 4, SHAPES_POINTER_END},
|
/* 05 */{1*packet_traits<RhsScalar>::size, 1,4, 3, 4, SHAPES_POINTER_END},
|
||||||
/* 6 */{2*packet_traits<RhsScalar>::size,1,4, 3, 5, SHAPES_POINTER_END},
|
/* 06 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 4, SHAPES_POINTER_END},
|
||||||
/* 7 */{3*packet_traits<RhsScalar>::size,1,4, 3, 6, SHAPES_POINTER_END}};
|
/* 07 */{2*packet_traits<RhsScalar>::size, 1,4, 3, 6, SHAPES_POINTER_END},
|
||||||
|
/* 08 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 6, 7},
|
||||||
|
/* 09 */{3*packet_traits<RhsScalar>::size, 1,4, 3, 8, SHAPES_POINTER_END},
|
||||||
|
/* 10 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 8, 9}};
|
||||||
|
|
||||||
// d1progress x d2progress
|
// d1progress x d2progress
|
||||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||||
@ -225,6 +232,8 @@ struct Accumulator
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket>
|
template<typename ResPacket>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
|
||||||
{
|
{
|
||||||
@ -305,11 +314,21 @@ struct DepthLoopStruct
|
|||||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
||||||
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
||||||
|
|
||||||
|
#ifdef __ENABLE_PREFETCH__
|
||||||
|
prefetch(lhsPackMap.pCur);
|
||||||
|
prefetch(rhsPackMap.pCur);
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||||
|
|
||||||
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
|
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
|
||||||
AccumulatorType acc;
|
AccumulatorType acc;
|
||||||
acc.zero();
|
acc.zero();
|
||||||
|
|
||||||
|
#ifdef __ENABLE_PREFETCH__
|
||||||
|
acc.prefetch(res, rowIdx, colIdx);
|
||||||
|
#endif
|
||||||
|
|
||||||
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
||||||
{
|
{
|
||||||
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
|
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
echo 'Compiling with master'
|
#echo 'Compiling with master'
|
||||||
g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
|
#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
|
||||||
echo 'Compiling current'
|
echo 'Compiling current'
|
||||||
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt
|
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gt
|
@ -15,10 +15,11 @@ void set(MatrixXf& A, int m, int n, int id, int digits)
|
|||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
#ifdef __DEBUG__
|
#ifdef __DEBUG__
|
||||||
int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n);
|
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
|
||||||
MatrixXf A = MatrixXf::Zero(m, k);
|
MatrixXf A = MatrixXf::Zero(m, k);
|
||||||
MatrixXf B = MatrixXf::Zero(k, n);
|
MatrixXf B = MatrixXf::Zero(k, n);
|
||||||
MatrixXf C = MatrixXf::Zero(m, n);
|
MatrixXf C = MatrixXf::Zero(m, n);
|
||||||
|
MatrixXf D = MatrixXf::Zero(m, n);
|
||||||
|
|
||||||
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
||||||
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
||||||
@ -40,17 +41,25 @@ int main(int argc, char* argv[])
|
|||||||
{
|
{
|
||||||
acc += A(i,kk)*B(kk,j);
|
acc += A(i,kk)*B(kk,j);
|
||||||
}
|
}
|
||||||
C(i,j) = acc;
|
D(i,j) = acc;
|
||||||
//std::cout << acc << " ";
|
if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
|
||||||
|
{
|
||||||
|
std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
//std::cout << std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << C << std::endl;
|
std::cout << C << std::endl;
|
||||||
#else
|
#else
|
||||||
int sz = 128;
|
if(argc < 3)
|
||||||
|
{
|
||||||
|
std::cout << "Wrong number of arguments." << std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int sz = std::atoi(argv[1]);
|
||||||
int m = sz, k = sz, n = sz;
|
int m = sz, k = sz, n = sz;
|
||||||
int RUNS = 500;
|
int RUNS = std::atoi(argv[2]);
|
||||||
double time = 0;
|
double time = 0;
|
||||||
|
|
||||||
for(auto i = 0; i < RUNS; i++)
|
for(auto i = 0; i < RUNS; i++)
|
||||||
|
63
run.sh
63
run.sh
@ -1,32 +1,33 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
echo 'Running with master'
|
function run() {
|
||||||
T_OLD1=$(./gto)
|
OLD=0
|
||||||
echo $T_OLD1
|
NEW=0
|
||||||
echo 'Running current'
|
EXECS=$1
|
||||||
T_NEW1=$(./gt)
|
SIZE=$2
|
||||||
echo $T_NEW1
|
RUNS=$3
|
||||||
echo 'Running with master'
|
for ((i = 0; i < $EXECS; i++)) do
|
||||||
T_OLD2=$(./gto)
|
SEL=$(A=$(shuf -i 0-10 -n 1); echo $(($A % 2)))
|
||||||
echo $T_OLD2
|
if [ $SEL -eq 0 ]; then
|
||||||
echo 'Running with master'
|
T_OLD=$(./gto $SIZE $RUNS)
|
||||||
T_OLD3=$(./gto)
|
#echo "Master: $T_OLD"
|
||||||
echo $T_OLD3
|
OLD=$OLD+$T_OLD
|
||||||
echo 'Running current'
|
T_NEW=$(./gt $SIZE $RUNS)
|
||||||
T_NEW2=$(./gt)
|
#echo "Current: $T_NEW"
|
||||||
echo $T_NEW2
|
else
|
||||||
echo 'Running with master'
|
T_NEW=$(./gt $SIZE $RUNS)
|
||||||
T_OLD4=$(./gto)
|
#echo "Current: $T_NEW"
|
||||||
echo $T_OLD4
|
T_OLD=$(./gto $SIZE $RUNS)
|
||||||
echo 'Running current'
|
#echo "Master: $T_OLD"
|
||||||
T_NEW3=$(./gt)
|
OLD=$OLD+$T_OLD
|
||||||
echo $T_NEW3
|
fi
|
||||||
echo 'Running current'
|
NEW=$NEW+$T_NEW
|
||||||
T_NEW4=$(./gt)
|
done
|
||||||
echo $T_NEW4
|
SPEED=$(echo "($OLD) / ($NEW)" | bc -l)
|
||||||
echo 'Running with master'
|
echo "$SIZE -> $SPEED"
|
||||||
T_OLD5=$(./gto)
|
}
|
||||||
echo $T_OLD5
|
|
||||||
echo 'Running current'
|
run $1 16 500
|
||||||
T_NEW5=$(./gt)
|
run $1 32 500
|
||||||
echo $T_NEW5
|
run $1 64 500
|
||||||
echo "($T_OLD1 + $T_OLD2 + $T_OLD3 + $T_OLD4 + $T_OLD5) / ($T_NEW1 + $T_NEW2 + $T_NEW3 + $T_NEW4 + $T_NEW5)" | bc -l
|
run $1 128 100
|
||||||
|
run $1 256 100
|
||||||
|
Loading…
x
Reference in New Issue
Block a user