From d216764f46998ecf5ce56b4f2ee3bfb632c44757 Mon Sep 17 00:00:00 2001
From: Everton Constantino
Date: Fri, 23 Apr 2021 17:28:17 +0000
Subject: [PATCH] WIP

---
Review notes (free text between the "---" separator and the diffstat; not
part of the commit message):

What the hunks below do, as far as this patch shows:
  * Kernels.h, SHAPES table: adds a {4,8,4} row, bumps SHAPES_COUNT from 7
    to 8, and changes the last pointer of the {8,4,4} row from 5 to 6.
    Assuming exactly one table row is hidden between the first two hunks,
    index 6 is the {8,1,4} row, i.e. the same row index 5 referred to
    before the insertion.
  * Kernels.h, Accumulator<..., 1, 4>: _acc becomes a PacketBlock and is
    accessed through packet[0]; store() now adds into dest (+=) instead of
    overwriting it. The packet-wise store is left commented out under a
    [TODO].
  * Kernels.h, new MicroKernel specializations <..., 8, 8, 4> and
    <..., 4, 8, 4>: each expands MICRO_8x1x4 / MICRO_4x1x4 eight times
    between a pair of asm marker comments.
  * Kernels.h, 4x1x1 and 1x1x4 kernels: the scalar operand is broadcast
    with pset1 and the product is taken packet-by-packet.
  * MatrixProduct.h, generic Accumulator store loop: dest(...) is now
    updated with += instead of =.

Changed in this revision of the patch:
  * The 4,8,4 MicroKernel emitted "#BEGIN/END_NEON_MICROKERNEL_4x4x4"
    markers (copy-paste from the 4,4,4 kernel). They now read 4x8x4, in
    line with the 8,8,4 kernel added by this same patch, which labels
    itself 8x8x4.

NOTE(review): template parameter/argument lists look stripped in this copy
of the patch (bare "template", "packet_traits::type", "pset1(0)"); they are
left exactly as found -- confirm against the original commit before applying.
NOTE(review): no {8,8,4} row is visible in the SHAPES hunks, so it is unclear
from here whether the new 8,8,4 MicroKernel is reachable -- TODO confirm.

 Eigen/src/Core/arch/NEON/Kernels.h       | 91 ++++++++++++++++++++----
 Eigen/src/Core/arch/NEON/MatrixProduct.h |  2 +-
 2 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h
index 3f6f25d4e..173a8a7c9 100644
--- a/Eigen/src/Core/arch/NEON/Kernels.h
+++ b/Eigen/src/Core/arch/NEON/Kernels.h
@@ -15,9 +15,8 @@ namespace Eigen {
 namespace internal {
 
 template
-constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 7;
+constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 8;
 
-// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
 template
 constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] = {
     {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
@@ -25,8 +24,9 @@ constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScala
     {1,1,4, 1, SHAPES_POINTER_END, SHAPES_POINTER_END},
     {4,1,4, 1, 2, SHAPES_POINTER_END},
     {4,4,4, 1, 2, 3},
+    {4,8,4, 1, 2, 4},
     {8,1,4, 1, 4, SHAPES_POINTER_END},
-    {8,4,4, 1, 4, 5}};
+    {8,4,4, 1, 4, 6}};
 
 template
 struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
@@ -63,25 +63,29 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
   using AccPacket = typename packet_traits::type;
   using ResPacket = typename packet_traits::type;
 
-  AccPacket _acc;
+  PacketBlock _acc;
 
   EIGEN_STRONG_INLINE void zero()
   {
-    _acc = pset1(0);
+    _acc.packet[0] = pset1(0);
   }
 
   template
   EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
   {
-    _acc *= pAlpha;
+    _acc.packet[0] *= pAlpha;
   }
 
   EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
   {
-    dest(row, col + 0) = _acc[0];
-    dest(row, col + 1) = _acc[1];
-    dest(row, col + 2) = _acc[2];
-    dest(row, col + 3) = _acc[3];
+    // [TODO] Check this
+    //_acc.packet[0] += dest.template loadPacket(row, col);
+    //dest.template storePacketBlock(row, col, _acc);
+
+    dest(row, col + 0) += _acc.packet[0][0];
+    dest(row, col + 1) += _acc.packet[0][1];
+    dest(row, col + 2) += _acc.packet[0][2];
+    dest(row, col + 3) += _acc.packet[0][3];
   }
 };
 
@@ -207,6 +211,35 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
   lhsPackMap.advance(4*1); \
   rhsPackMap.advance(1*4);
 
+template
+struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
+{
+  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
+                                      RhsPackMap& rhsPackMap,
+                                      Index rowIdx, Index colIdx, Index depthIdx,
+                                      Accumulator& acc)
+  {
+    using LhsPacket = typename packet_traits::type;
+    using RhsPacket = typename packet_traits::type;
+
+    asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
+
+    LhsPacket pLhs, pLhs2;
+    RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
+
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+    MICRO_8x1x4();
+
+    asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
+  };
+};
+
 template
 struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
 {
@@ -268,6 +301,34 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
   lhsPackMap.advance(4*1); \
   rhsPackMap.advance(1*4);
 
+template
+struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
+{
+  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
+                                      RhsPackMap& rhsPackMap,
+                                      Index rowIdx, Index colIdx, Index depthIdx,
+                                      Accumulator& acc)
+  {
+    using LhsPacket = typename packet_traits::type;
+    using RhsPacket = typename packet_traits::type;
+
+    asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x8x4\n\t");
+    LhsPacket pLhs;
+    RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
+
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+    MICRO_4x1x4();
+
+    asm __volatile__("#END_NEON_MICROKERNEL_4x8x4\n\t");
+  };
+};
+
 template
 struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
 {
@@ -322,13 +383,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
                                       Accumulator& acc)
   {
     using LhsPacket = typename packet_traits::type;
+    using RhsPacket = typename packet_traits::type;
 
     asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
 
     LhsPacket pLhs = pload(lhsPackMap.pCur);
-    RhsScalar rhs = *rhsPackMap.pCur;
+    RhsPacket pRhs = pset1(*rhsPackMap.pCur);
 
-    acc._acc += pLhs*rhs;
+    acc._acc += pLhs*pRhs;
 
     lhsPackMap.advance(4*1);
     rhsPackMap.advance(1);
@@ -345,13 +407,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
                                       Accumulator& acc)
   {
     using RhsPacket = typename packet_traits::type;
+    using LhsPacket = typename packet_traits::type;
 
     asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
 
     RhsPacket pRhs = pload(rhsPackMap.pCur);
-    LhsScalar lhs = *lhsPackMap.pCur;
+    LhsPacket pLhs = pset1(*lhsPackMap.pCur);
 
-    acc._acc += pRhs*lhs;
+    acc._acc.packet[0] += pRhs*pLhs;
 
     lhsPackMap.advance(1);
     rhsPackMap.advance(4*1);
diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h
index f766a6427..61af4e911 100644
--- a/Eigen/src/Core/arch/NEON/MatrixProduct.h
+++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h
@@ -226,7 +226,7 @@ struct Accumulator
     {
       for(auto j = 0; j < N; j++)
       {
-        dest(row + i, col + j) = dt[i][j];
+        dest(row + i, col + j) += dt[i][j];
       }
     }
   }