From c62ed9b214ea0af06da7997ac185a0b009b2fb59 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Thu, 22 Apr 2021 20:42:44 +0000 Subject: [PATCH] WIP --- Eigen/src/Core/arch/NEON/Kernels.h | 189 ++++++++++++++++++++++++-- Eigen/src/Core/arch/NEON/PackingOps.h | 70 ++++++++++ 2 files changed, 247 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index 65f23a9b6..973f71d06 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -14,6 +14,13 @@ namespace Eigen { namespace internal { +template +constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 7; + +template +constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] = + {{1,1,1,SHAPES_POINTER_END},{4,1,1,0},{1,1,4,1},{4,1,4,1},{4,4,4,1},{8,1,4,1},{8,4,4,1}}; + template struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> { @@ -111,6 +118,172 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> } }; +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; + + PacketBlock _acc1; + PacketBlock _acc2; + + EIGEN_STRONG_INLINE void zero() + { + _acc1.packet[0] = pset1(0); + _acc1.packet[1] = pset1(0); + _acc1.packet[2] = pset1(0); + _acc1.packet[3] = pset1(0); + + _acc2.packet[0] = pset1(0); + _acc2.packet[1] = pset1(0); + _acc2.packet[2] = pset1(0); + _acc2.packet[3] = pset1(0); + } + + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc1.packet[0] *= pAlpha; + _acc1.packet[1] *= pAlpha; + _acc1.packet[2] *= pAlpha; + _acc1.packet[3] *= pAlpha; + + _acc2.packet[0] *= pAlpha; + _acc2.packet[1] *= pAlpha; + _acc2.packet[2] *= pAlpha; + _acc2.packet[3] *= pAlpha; + } + + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); + LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); + LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); + LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); + + LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); + LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); + LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); + LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); + + + r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); + r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); + r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); + r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); + + r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); + r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); + r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); + r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); + } +}; + +#define MICRO_8x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + lhsPackMap.advance(4*1); \ + pLhs2 = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs2 = pset1(pRhs[2]); \ + pRhs3 = pset1(pRhs[3]); \ + acc._acc1.packet[0] += pLhs*pRhs0; \ + acc._acc1.packet[1] += pLhs*pRhs1; \ + acc._acc1.packet[2] += pLhs*pRhs2; \ + acc._acc1.packet[3] += pLhs*pRhs3; \ + acc._acc2.packet[0] += pLhs2*pRhs0; \ + acc._acc2.packet[1] += pLhs2*pRhs1; \ + acc._acc2.packet[2] += pLhs2*pRhs2; \ + acc._acc2.packet[3] += pLhs2*pRhs3; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1*4); + +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t"); + + LhsPacket pLhs, pLhs2; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); + + asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t"); + }; +}; + +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); + + LhsPacket pLhs, pLhs2; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + + MICRO_8x1x4(); + + asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); + }; +}; + +#define MICRO_4x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs2 = pset1(pRhs[2]); \ + pRhs3 = pset1(pRhs[3]); \ + acc._acc.packet[0] += pLhs*pRhs0; \ + acc._acc.packet[1] += pLhs*pRhs1; \ + acc._acc.packet[2] += pLhs*pRhs2; \ + acc._acc.packet[3] += pLhs*pRhs3; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1*4); + +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); + LhsPacket pLhs; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + + MICRO_4x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); + asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); + }; +}; + template struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4> { @@ -123,20 +296,12 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, using RhsPacket = typename packet_traits::type; asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t"); - LhsPacket pLhs = pload(lhsPackMap.pCur); - RhsPacket pRhs = pload(rhsPackMap.pCur); - RhsPacket pRhs0 = pset1(pRhs[0]); - RhsPacket pRhs1 = pset1(pRhs[1]); - RhsPacket pRhs2 = pset1(pRhs[2]); - RhsPacket pRhs3 = pset1(pRhs[3]); - acc._acc.packet[0] += pLhs*pRhs0; - acc._acc.packet[1] += pLhs*pRhs1; - acc._acc.packet[2] += pLhs*pRhs2; - acc._acc.packet[3] += pLhs*pRhs3; + LhsPacket pLhs; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + + MICRO_4x1x4(); - lhsPackMap.advance(4*1); - rhsPackMap.advance(1*4); asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t"); }; }; diff --git a/Eigen/src/Core/arch/NEON/PackingOps.h b/Eigen/src/Core/arch/NEON/PackingOps.h index bc1629a1a..40dbea568 100644 --- a/Eigen/src/Core/arch/NEON/PackingOps.h +++ b/Eigen/src/Core/arch/NEON/PackingOps.h @@ -17,9 +17,15 @@ namespace internal { template constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3; +template +constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, true> = 4; + template constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}}; +template +constexpr int PACK_SHAPES<0, CPU, Scalar, true>[PACK_SHAPES_COUNT<0, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0},{8,1,2}}; + template struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4> { @@ -69,6 +75,70 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane } }; +template +struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 8, 1> +{ + EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data) + { + using Packet = typename packet_traits::type; + Scalar *c = block; + if(isLhs && StorageOrder == ColMajor) + { + Packet p = data.template loadPacket(d1Idx + 0, d2Idx); + pstore(c, p); + c+=4; + p = data.template loadPacket(d1Idx + 4, d2Idx); + pstore(c, p); + c+=4; + } else if(!isLhs && StorageOrder == RowMajor) { + Packet p = data.template loadPacket(d2Idx, d1Idx + 0); + pstore(c, p); + c+=4; + p = data.template loadPacket(d2Idx, d1Idx + 4); + pstore(c, p); + c+=4; + } else { + if(isLhs) + { + *c = data(d1Idx + 0, d2Idx + 0); + c++; + *c = data(d1Idx + 1, d2Idx + 0); + c++; + *c = data(d1Idx + 2, d2Idx + 0); + c++; + *c = data(d1Idx + 3, d2Idx + 0); + c++; + *c = data(d1Idx + 0, d2Idx + 4); + c++; + *c = data(d1Idx + 1, d2Idx + 4); + c++; + *c = data(d1Idx + 2, d2Idx + 4); + c++; + *c = data(d1Idx + 3, d2Idx + 4); + c++; + } else { + *c = data(d2Idx, d1Idx + 0); + c++; + *c = data(d2Idx, d1Idx + 1); + c++; + *c = data(d2Idx, d1Idx + 2); + c++; + *c = data(d2Idx, d1Idx + 3); + c++; + *c = data(d2Idx + 4, d1Idx + 0); + c++; + *c = data(d2Idx + 4, d1Idx + 1); + c++; + *c = data(d2Idx + 4, d1Idx + 2); + c++; + *c = data(d2Idx + 4, d1Idx + 3); + c++; + } + } + return c; + } +}; + template struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1> {