Everton Constantino 2021-05-13 14:42:22 +00:00
parent a8ec6d6a36
commit 9b8cdceea8
5 changed files with 248 additions and 196 deletions


@ -92,6 +92,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
_acc.packet[2] = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
@ -127,6 +129,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
_acc.packet[1] = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
@ -157,6 +161,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
{
_acc = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
@ -186,6 +192,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
_acc = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
@ -216,6 +224,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
_acc.packet[3] = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
{
dest.getLinearMapper(row, col + 0).prefetch(0);
dest.getLinearMapper(row, col + 1).prefetch(0);
dest.getLinearMapper(row, col + 2).prefetch(0);
dest.getLinearMapper(row, col + 3).prefetch(0);
}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
@ -227,15 +243,17 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc.packet[0]);
r1.storePacket(0, r1.template loadPacket<ResPacket>(0) + _acc.packet[1]);
r2.storePacket(0, r2.template loadPacket<ResPacket>(0) + _acc.packet[2]);
r3.storePacket(0, r3.template loadPacket<ResPacket>(0) + _acc.packet[3]);
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[0]);
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[1]);
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[2]);
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[3]);
}
};
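The store() rewrite above (repeated for the 8x4 and 12x4 accumulators below) adds a per-destination prefetch() hook and replaces one LinearMapper per row block with a single mapper per result column, addressed at offsets 0*PacketSize, 1*PacketSize, ... This works because the result is column-major, so consecutive row blocks of one column are contiguous in memory. A standalone sketch of that addressing idea, using a plain array instead of Eigen's DataMapper (all names below are illustrative, not from the patch):

// Hedged sketch: one base pointer per column plus PacketSize offsets reaches
// every row block, instead of separate mappers anchored at row, row+4, row+8.
#include <cstddef>
#include <iostream>

int main() {
  constexpr std::size_t rows = 12, cols = 4, PacketSize = 4;
  float C[rows * cols] = {};          // column-major result block
  float* col0 = &C[0 * rows];         // stand-in for dest.getLinearMapper(row, col + 0)
  for (std::size_t block = 0; block < rows / PacketSize; ++block)   // three accumulator packets
    for (std::size_t lane = 0; lane < PacketSize; ++lane)
      col0[block * PacketSize + lane] += 1.0f;  // stand-in for storePacket(block*PacketSize, ...)
  std::cout << C[rows - 1] << "\n";             // last row of column 0 was written: prints 1
  return 0;
}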
@ -262,6 +280,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
_acc2.packet[3] = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
{
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
@ -278,26 +304,22 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
}
};
@ -330,6 +352,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
_acc3.packet[3] = pset1<AccPacket>(0);
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
{
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
@ -351,94 +381,139 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
LinearMapper r20 = dest.getLinearMapper(row + 12, col + 0);
LinearMapper r21 = dest.getLinearMapper(row + 12, col + 1);
LinearMapper r22 = dest.getLinearMapper(row + 12, col + 2);
LinearMapper r23 = dest.getLinearMapper(row + 12, col + 3);
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
r20.storePacket(0, r20.template loadPacket<ResPacket>(0) + _acc3.packet[0]);
r21.storePacket(0, r21.template loadPacket<ResPacket>(0) + _acc3.packet[1]);
r22.storePacket(0, r22.template loadPacket<ResPacket>(0) + _acc3.packet[2]);
r23.storePacket(0, r23.template loadPacket<ResPacket>(0) + _acc3.packet[3]);
r0.storePacket(2*PacketSize, r0.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[0]);
r1.storePacket(2*PacketSize, r1.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[1]);
r2.storePacket(2*PacketSize, r2.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[2]);
r3.storePacket(2*PacketSize, r3.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[3]);
}
};
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
// {
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
// RhsPackMap& rhsPackMap,
// Index rowIdx, Index colIdx, Index depthIdx,
// Accumulator& acc)
// {
// using LhsPacket = typename packet_traits<LhsScalar>::type;
// using RhsPacket = typename packet_traits<RhsScalar>::type;
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
// LhsPacket pLhs, pLhs2;
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
#if __UNROLL__ > 4
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
#endif
// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
// };
// };
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
};
};
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
// {
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
// RhsPackMap& rhsPackMap,
// Index rowIdx, Index colIdx, Index depthIdx,
// Accumulator& acc)
// {
// using LhsPacket = typename packet_traits<LhsScalar>::type;
// using RhsPacket = typename packet_traits<RhsScalar>::type;
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
// LhsPacket pLhs, pLhs2;
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
// prefetch(lhsPackMap.pCur + 2*32);
// prefetch(rhsPackMap.pCur + 2*16);
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
// MICRO_8x1x4();
#if __UNROLL__ == 8
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur + (48+0));
#endif
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur + (48+16));
#endif
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#else
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
};
};
// asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
// };
// };
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t");
LhsPacket pLhs, pLhs2, pLhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
#if __UNROLL__ == 8
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur);
#endif
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
#else
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
};
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
@ -451,14 +526,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t");
LhsPacket pLhs, pLhs2, pLhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_12x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
};
};
@ -484,58 +559,6 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
};
};
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
// {
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
// RhsPackMap& rhsPackMap,
// Index rowIdx, Index colIdx, Index depthIdx,
// Accumulator& acc)
// {
// using LhsPacket = typename packet_traits<LhsScalar>::type;
// using RhsPacket = typename packet_traits<RhsScalar>::type;
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
// LhsPacket pLhs;
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
// };
// };
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
// {
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
// RhsPackMap& rhsPackMap,
// Index rowIdx, Index colIdx, Index depthIdx,
// Accumulator& acc)
// {
// using LhsPacket = typename packet_traits<LhsScalar>::type;
// using RhsPacket = typename packet_traits<RhsScalar>::type;
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
// LhsPacket pLhs;
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// MICRO_4x1x4();
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
// };
// };
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
{


@ -18,8 +18,12 @@ namespace Eigen {
namespace internal {
#ifndef __UNROLL__
#define __UNROLL__ 8
#endif
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT = 8;
constexpr int SHAPES_COUNT = 11;
constexpr int SHAPES_DIMENSION = 6;
constexpr int SHAPES_LHS_DIMENSION = 0;
@ -43,14 +47,17 @@ constexpr int PACK_SHAPES_END = -1;
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
{ /* 0 */{ 1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 1 */{1*packet_traits<RhsScalar>::size,1,1, 0, 0, SHAPES_POINTER_END},
/* 2 */{2*packet_traits<RhsScalar>::size,1,1, 0, 1, SHAPES_POINTER_END},
/* 3 */{3*packet_traits<RhsScalar>::size,1,1, 0, 2, SHAPES_POINTER_END},
/* 4 */{ 1,1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 5 */{1*packet_traits<RhsScalar>::size,1,4, 3, 4, SHAPES_POINTER_END},
/* 6 */{2*packet_traits<RhsScalar>::size,1,4, 3, 5, SHAPES_POINTER_END},
/* 7 */{3*packet_traits<RhsScalar>::size,1,4, 3, 6, SHAPES_POINTER_END}};
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
/* 02 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
/* 03 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
/* 04 */{ 1, 1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 05 */{1*packet_traits<RhsScalar>::size, 1,4, 3, 4, SHAPES_POINTER_END},
/* 06 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 4, SHAPES_POINTER_END},
/* 07 */{2*packet_traits<RhsScalar>::size, 1,4, 3, 6, SHAPES_POINTER_END},
/* 08 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 6, 7},
/* 09 */{3*packet_traits<RhsScalar>::size, 1,4, 3, 8, SHAPES_POINTER_END},
/* 10 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 8, 9}};
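Rows 06, 08 and 10 are the new entries (hence SHAPES_COUNT growing from 8 to 11): for each vector width they register a depth-unrolled variant that advances __UNROLL__ depth steps per micro-kernel call, while the matching depth-1 row stays reachable through the trailing pointer entries. A small standalone mock of reading such a row (field names are mine; the real code indexes the table via the SHAPES_*_DIMENSION constants), assuming float on NEON with packet_traits<float>::size == 4 and the default __UNROLL__ of 8:

// Hedged, self-contained sketch of one shape row; mirrors row 10 above.
#include <cstdio>

int main() {
  // {lhs_progress, depth_progress, rhs_progress, then indices of other shape rows}
  constexpr int shape10[6] = {12, 8, 4, 3, 8, 9};
  std::printf("kernel advances %d rows x %d depth steps x %d columns per call\n",
              shape10[0], shape10[1], shape10[2]);
  return 0;
}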
// d1progress x d2progress
template<int Architecture, int CPU, typename Scalar, bool isLhs>
@ -225,6 +232,8 @@ struct Accumulator
}
}
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
template<typename ResPacket>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
{
@ -305,11 +314,21 @@ struct DepthLoopStruct
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
#ifdef __ENABLE_PREFETCH__
prefetch(lhsPackMap.pCur);
prefetch(rhsPackMap.pCur);
#endif
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
AccumulatorType acc;
acc.zero();
#ifdef __ENABLE_PREFETCH__
acc.prefetch(res, rowIdx, colIdx);
#endif
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
{
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
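The depth loop now warms the caches before entering the micro-kernel: it prefetches the packed LHS/RHS pointers and asks the accumulator to prefetch its destination columns, all compiled in only under __ENABLE_PREFETCH__. The prefetch() called here is Eigen's own internal helper; a minimal stand-in showing the mechanism (the helper name and main() are mine, not the library's API):

// Hedged stand-in for the software-prefetch helper assumed above; Eigen's
// internal::prefetch is the real one, this only illustrates the idea.
template <typename T>
inline void prefetch_hint(const T* addr) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/3);   // read, keep in all cache levels
#else
  (void)addr;                                           // no-op without the intrinsic
#endif
}

int main() {
  float packed[64] = {};
  prefetch_hint(packed + 48);   // same flavour as the rhsPackMap.pCur + 48 hints in the kernels
  return 0;
}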


@ -1,5 +1,5 @@
#!/bin/bash
echo 'Compiling with master'
g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
#echo 'Compiling with master'
#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
echo 'Compiling current'
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gt
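make.sh now builds only the current tree and enables the prefetch path alongside the vector kernels. Because __UNROLL__ is guarded by #ifndef in the header, it can also be overridden at compile time; a hedged example (the output name gt_u4 is mine):

# Hedged example, not part of the commit: build a variant with a smaller unroll factor
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -D__UNROLL__=4 -o gt_u4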


@ -15,10 +15,11 @@ void set(MatrixXf& A, int m, int n, int id, int digits)
int main(int argc, char* argv[])
{
#ifdef __DEBUG__
int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n);
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
MatrixXf A = MatrixXf::Zero(m, k);
MatrixXf B = MatrixXf::Zero(k, n);
MatrixXf C = MatrixXf::Zero(m, n);
MatrixXf D = MatrixXf::Zero(m, n);
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
@ -40,17 +41,25 @@ int main(int argc, char* argv[])
{
acc += A(i,kk)*B(kk,j);
}
C(i,j) = acc;
//std::cout << acc << " ";
D(i,j) = acc;
if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
{
std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
}
}
//std::cout << std::endl;
}
std::cout << C << std::endl;
#else
int sz = 128;
if(argc < 3)
{
std::cout << "Wrong number of arguments." << std::endl;
return -1;
}
int sz = std::atoi(argv[1]);
int m = sz, k = sz, n = sz;
int RUNS = 500;
int RUNS = std::atoi(argv[2]);
double time = 0;
for(auto i = 0; i < RUNS; i++)

run.sh

@ -1,32 +1,33 @@
#!/bin/bash
echo 'Running with master'
T_OLD1=$(./gto)
echo $T_OLD1
echo 'Running current'
T_NEW1=$(./gt)
echo $T_NEW1
echo 'Running with master'
T_OLD2=$(./gto)
echo $T_OLD2
echo 'Running with master'
T_OLD3=$(./gto)
echo $T_OLD3
echo 'Running current'
T_NEW2=$(./gt)
echo $T_NEW2
echo 'Running with master'
T_OLD4=$(./gto)
echo $T_OLD4
echo 'Running current'
T_NEW3=$(./gt)
echo $T_NEW3
echo 'Running current'
T_NEW4=$(./gt)
echo $T_NEW4
echo 'Running with master'
T_OLD5=$(./gto)
echo $T_OLD5
echo 'Running current'
T_NEW5=$(./gt)
echo $T_NEW5
echo "($T_OLD1 + $T_OLD2 + $T_OLD3 + $T_OLD4 + $T_OLD5) / ($T_NEW1 + $T_NEW2 + $T_NEW3 + $T_NEW4 + $T_NEW5)" | bc -l
function run() {
OLD=0
NEW=0
EXECS=$1
SIZE=$2
RUNS=$3
for ((i = 0; i < $EXECS; i++)) do
SEL=$(A=$(shuf -i 0-10 -n 1); echo $(($A % 2)))
if [ $SEL -eq 0 ]; then
T_OLD=$(./gto $SIZE $RUNS)
#echo "Master: $T_OLD"
OLD=$OLD+$T_OLD
T_NEW=$(./gt $SIZE $RUNS)
#echo "Current: $T_NEW"
else
T_NEW=$(./gt $SIZE $RUNS)
#echo "Current: $T_NEW"
T_OLD=$(./gto $SIZE $RUNS)
#echo "Master: $T_OLD"
OLD=$OLD+$T_OLD
fi
NEW=$NEW+$T_NEW
done
SPEED=$(echo "($OLD) / ($NEW)" | bc -l)
echo "$SIZE -> $SPEED"
}
run $1 16 500
run $1 32 500
run $1 64 500
run $1 128 100
run $1 256 100
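With this driver, run.sh takes the number of interleaved executions per size as its single argument, and each binary receives the matrix size and run count directly (matching the new argv handling in new_gemm_test.cpp). A hedged usage example, assuming both ./gt and ./gto exist (the rewritten make.sh no longer builds the master baseline gto):

# Hedged usage example, not part of the commit:
./make.sh      # builds ./gt against the current tree
./run.sh 10    # 10 randomized-order runs of each binary per size; prints lines like "64 -> 1.12"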