This commit is contained in:
Everton Constantino 2021-04-22 20:42:44 +00:00
parent 82a7715b01
commit c62ed9b214
2 changed files with 247 additions and 12 deletions

View File

@ -14,6 +14,13 @@ namespace Eigen {
namespace internal {
template<int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 7;
template<int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
{{1,1,1,SHAPES_POINTER_END},{4,1,1,0},{1,1,4,1},{4,1,4,1},{4,4,4,1},{8,1,4,1},{8,4,4,1}};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
{
@ -111,6 +118,172 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
}
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
{
using LinearMapper = typename DataMapper::LinearMapper;
using AccPacket = typename packet_traits<Scalar>::type;
using ResPacket = typename packet_traits<ResScalar>::type;
PacketBlock<AccPacket, 4> _acc1;
PacketBlock<AccPacket, 4> _acc2;
EIGEN_STRONG_INLINE void zero()
{
_acc1.packet[0] = pset1<AccPacket>(0);
_acc1.packet[1] = pset1<AccPacket>(0);
_acc1.packet[2] = pset1<AccPacket>(0);
_acc1.packet[3] = pset1<AccPacket>(0);
_acc2.packet[0] = pset1<AccPacket>(0);
_acc2.packet[1] = pset1<AccPacket>(0);
_acc2.packet[2] = pset1<AccPacket>(0);
_acc2.packet[3] = pset1<AccPacket>(0);
}
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{
_acc1.packet[0] *= pAlpha;
_acc1.packet[1] *= pAlpha;
_acc1.packet[2] *= pAlpha;
_acc1.packet[3] *= pAlpha;
_acc2.packet[0] *= pAlpha;
_acc2.packet[1] *= pAlpha;
_acc2.packet[2] *= pAlpha;
_acc2.packet[3] *= pAlpha;
}
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
}
};
#define MICRO_8x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
acc._acc1.packet[0] += pLhs*pRhs0; \
acc._acc1.packet[1] += pLhs*pRhs1; \
acc._acc1.packet[2] += pLhs*pRhs2; \
acc._acc1.packet[3] += pLhs*pRhs3; \
acc._acc2.packet[0] += pLhs2*pRhs0; \
acc._acc2.packet[1] += pLhs2*pRhs1; \
acc._acc2.packet[2] += pLhs2*pRhs2; \
acc._acc2.packet[3] += pLhs2*pRhs3; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4);
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
};
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_8x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
};
};
#define MICRO_4x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
acc._acc.packet[0] += pLhs*pRhs0; \
acc._acc.packet[1] += pLhs*pRhs1; \
acc._acc.packet[2] += pLhs*pRhs2; \
acc._acc.packet[3] += pLhs*pRhs3; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4);
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
};
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
{
@ -123,20 +296,12 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
RhsPacket pRhs0 = pset1<RhsPacket>(pRhs[0]);
RhsPacket pRhs1 = pset1<RhsPacket>(pRhs[1]);
RhsPacket pRhs2 = pset1<RhsPacket>(pRhs[2]);
RhsPacket pRhs3 = pset1<RhsPacket>(pRhs[3]);
acc._acc.packet[0] += pLhs*pRhs0;
acc._acc.packet[1] += pLhs*pRhs1;
acc._acc.packet[2] += pLhs*pRhs2;
acc._acc.packet[3] += pLhs*pRhs3;
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
lhsPackMap.advance(4*1);
rhsPackMap.advance(1*4);
asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
};
};

View File

@ -17,9 +17,15 @@ namespace internal {
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, true> = 4;
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}};
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES<0, CPU, Scalar, true>[PACK_SHAPES_COUNT<0, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0},{8,1,2}};
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4>
{
@ -69,6 +75,70 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane
}
};
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 8, 1>
{
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
{
using Packet = typename packet_traits<Scalar>::type;
Scalar *c = block;
if(isLhs && StorageOrder == ColMajor)
{
Packet p = data.template loadPacket<Packet>(d1Idx + 0, d2Idx);
pstore<Scalar>(c, p);
c+=4;
p = data.template loadPacket<Packet>(d1Idx + 4, d2Idx);
pstore<Scalar>(c, p);
c+=4;
} else if(!isLhs && StorageOrder == RowMajor) {
Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx + 0);
pstore<Scalar>(c, p);
c+=4;
p = data.template loadPacket<Packet>(d2Idx, d1Idx + 4);
pstore<Scalar>(c, p);
c+=4;
} else {
if(isLhs)
{
*c = data(d1Idx + 0, d2Idx + 0);
c++;
*c = data(d1Idx + 1, d2Idx + 0);
c++;
*c = data(d1Idx + 2, d2Idx + 0);
c++;
*c = data(d1Idx + 3, d2Idx + 0);
c++;
*c = data(d1Idx + 0, d2Idx + 4);
c++;
*c = data(d1Idx + 1, d2Idx + 4);
c++;
*c = data(d1Idx + 2, d2Idx + 4);
c++;
*c = data(d1Idx + 3, d2Idx + 4);
c++;
} else {
*c = data(d2Idx, d1Idx + 0);
c++;
*c = data(d2Idx, d1Idx + 1);
c++;
*c = data(d2Idx, d1Idx + 2);
c++;
*c = data(d2Idx, d1Idx + 3);
c++;
*c = data(d2Idx + 4, d1Idx + 0);
c++;
*c = data(d2Idx + 4, d1Idx + 1);
c++;
*c = data(d2Idx + 4, d1Idx + 2);
c++;
*c = data(d2Idx + 4, d1Idx + 3);
c++;
}
}
return c;
}
};
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1>
{