From 58db05afbc36e27f8b683fc6da3d4d0aa5afaa20 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Thu, 13 May 2021 15:30:08 +0000 Subject: [PATCH] WIP 2 --- Eigen/src/Core/arch/NEON/Kernels.h | 164 ++++++++++++++++++----- Eigen/src/Core/arch/NEON/MatrixProduct.h | 23 ++-- new_gemm_test.cpp | 3 - run.sh | 7 +- 4 files changed, 150 insertions(+), 47 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index ef5591ea1..b01476a6a 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -17,30 +17,30 @@ namespace internal { #ifdef __ENABLE_VECTOR_KERNELS__ #define MICRO_12x1x4() \ - pLhs = pload(lhsPackMap.pCur); \ - lhsPackMap.advance(4*1); \ - pLhs2 = pload(lhsPackMap.pCur); \ - lhsPackMap.advance(4*1); \ - pLhs3 = pload(lhsPackMap.pCur); \ pRhs = pload(rhsPackMap.pCur); \ + rhsPackMap.advance(1*4); \ pRhs0 = pset1(pRhs[0]); \ pRhs1 = pset1(pRhs[1]); \ pRhs2 = pset1(pRhs[2]); \ pRhs3 = pset1(pRhs[3]); \ + pLhs = pload(lhsPackMap.pCur); \ + lhsPackMap.advance(4*1); \ acc._acc1.packet[0] += pLhs*pRhs0; \ acc._acc1.packet[1] += pLhs*pRhs1; \ acc._acc1.packet[2] += pLhs*pRhs2; \ acc._acc1.packet[3] += pLhs*pRhs3; \ + pLhs2 = pload(lhsPackMap.pCur); \ + lhsPackMap.advance(4*1); \ acc._acc2.packet[0] += pLhs2*pRhs0; \ acc._acc2.packet[1] += pLhs2*pRhs1; \ acc._acc2.packet[2] += pLhs2*pRhs2; \ acc._acc2.packet[3] += pLhs2*pRhs3; \ + pLhs3 = pload(lhsPackMap.pCur); \ acc._acc3.packet[0] += pLhs3*pRhs0; \ acc._acc3.packet[1] += pLhs3*pRhs1; \ acc._acc3.packet[2] += pLhs3*pRhs2; \ acc._acc3.packet[3] += pLhs3*pRhs3; \ - lhsPackMap.advance(4*1); \ - rhsPackMap.advance(1*4); + lhsPackMap.advance(4*1); #define MICRO_8x1x4() \ pLhs = pload(lhsPackMap.pCur); \ @@ -76,6 +76,36 @@ namespace internal { lhsPackMap.advance(4*1); \ rhsPackMap.advance(1*4); +#define MICRO_12x1x1() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pset1(*rhsPackMap.pCur); \ + acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \ + lhsPackMap.advance(4*1); \ + pLhs = pload(lhsPackMap.pCur); \ + acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \ + lhsPackMap.advance(4*1); \ + pLhs = pload(lhsPackMap.pCur); \ + acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1); + +#define MICRO_8x1x1() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pset1(*rhsPackMap.pCur); \ + acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \ + lhsPackMap.advance(4*1); \ + pLhs = pload(lhsPackMap.pCur); \ + acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1); + +#define MICRO_4x1x1() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pset1(*rhsPackMap.pCur); \ + acc._acc += pRhs*pLhs; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1); + template struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1> { @@ -581,6 +611,35 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, }; }; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + LhsPacket pLhs; + RhsPacket pRhs; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + + MICRO_12x1x1(); + MICRO_12x1x1(); + MICRO_12x1x1(); + MICRO_12x1x1(); + MICRO_12x1x1(); + MICRO_12x1x1(); + MICRO_12x1x1(); + MICRO_12x1x1(); + + asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); + }; +}; + template struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1> { @@ -592,21 +651,42 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, using LhsPacket = typename packet_traits::type; using RhsPacket = typename packet_traits::type; + LhsPacket pLhs; + RhsPacket pRhs; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + + MICRO_12x1x1(); + + asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); + }; +}; + +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 1> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); - LhsPacket pLhs = pload(lhsPackMap.pCur); - RhsPacket pRhs = pset1(*rhsPackMap.pCur); + LhsPacket pLhs; + RhsPacket pRhs; - acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); - lhsPackMap.advance(4*1); - pLhs = pload(lhsPackMap.pCur); - acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); - lhsPackMap.advance(4*1); - pLhs = pload(lhsPackMap.pCur); - acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); + MICRO_8x1x1(); + MICRO_8x1x1(); + MICRO_8x1x1(); + MICRO_8x1x1(); + MICRO_8x1x1(); + MICRO_8x1x1(); + MICRO_8x1x1(); + MICRO_8x1x1(); - lhsPackMap.advance(4*1); - rhsPackMap.advance(1); asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); }; }; @@ -624,16 +704,40 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); - LhsPacket pLhs = pload(lhsPackMap.pCur); - RhsPacket pRhs = pset1(*rhsPackMap.pCur); + LhsPacket pLhs; + RhsPacket pRhs; - acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); - lhsPackMap.advance(4*1); - pLhs = pload(lhsPackMap.pCur); - acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); + MICRO_8x1x1(); + + asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); + }; +}; + +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__, 1> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + + LhsPacket pLhs; + RhsPacket pRhs; + + MICRO_4x1x1(); + MICRO_4x1x1(); + MICRO_4x1x1(); + MICRO_4x1x1(); + MICRO_4x1x1(); + MICRO_4x1x1(); + MICRO_4x1x1(); + MICRO_4x1x1(); - lhsPackMap.advance(4*1); - rhsPackMap.advance(1); asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); }; }; @@ -651,13 +755,11 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); - LhsPacket pLhs = pload(lhsPackMap.pCur); - RhsPacket pRhs = pset1(*rhsPackMap.pCur); + LhsPacket pLhs; + RhsPacket pRhs; - acc._acc += pRhs*pLhs; + MICRO_4x1x1(); - lhsPackMap.advance(4*1); - rhsPackMap.advance(1); asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); }; }; diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h index 3e509f29e..f478ea15e 100644 --- a/Eigen/src/Core/arch/NEON/MatrixProduct.h +++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h @@ -23,7 +23,7 @@ namespace internal { #endif template -constexpr int SHAPES_COUNT = 11; +constexpr int SHAPES_COUNT = 14; constexpr int SHAPES_DIMENSION = 6; constexpr int SHAPES_LHS_DIMENSION = 0; @@ -49,15 +49,18 @@ template constexpr int SHAPES[SHAPES_COUNT][SHAPES_DIMENSION] = { /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, /* 01 */{1*packet_traits::size, 1,1, 0, 0, SHAPES_POINTER_END}, - /* 02 */{2*packet_traits::size, 1,1, 0, 1, SHAPES_POINTER_END}, - /* 03 */{3*packet_traits::size, 1,1, 0, 2, SHAPES_POINTER_END}, - /* 04 */{ 1, 1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /* 05 */{1*packet_traits::size, 1,4, 3, 4, SHAPES_POINTER_END}, - /* 06 */{1*packet_traits::size,__UNROLL__,4, 3, 4, SHAPES_POINTER_END}, - /* 07 */{2*packet_traits::size, 1,4, 3, 6, SHAPES_POINTER_END}, - /* 08 */{2*packet_traits::size,__UNROLL__,4, 3, 6, 7}, - /* 09 */{3*packet_traits::size, 1,4, 3, 8, SHAPES_POINTER_END}, - /* 10 */{3*packet_traits::size,__UNROLL__,4, 3, 8, 9}}; + /* 02 */{1*packet_traits::size,__UNROLL__,1, 0, 0, 1}, + /* 03 */{2*packet_traits::size, 1,1, 0, 2, SHAPES_POINTER_END}, + /* 04 */{2*packet_traits::size,__UNROLL__,1, 0, 2, 3}, + /* 05 */{3*packet_traits::size, 1,1, 0, 4, SHAPES_POINTER_END}, + /* 06 */{3*packet_traits::size,__UNROLL__,1, 0, 4, 5}, + /* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END}, + /* 08 */{1*packet_traits::size, 1,4, 6, 7, SHAPES_POINTER_END}, + /* 09 */{1*packet_traits::size,__UNROLL__,4, 6, 7, 8}, + /* 10 */{2*packet_traits::size, 1,4, 6, 9, SHAPES_POINTER_END}, + /* 11 */{2*packet_traits::size,__UNROLL__,4, 6, 9, 10}, + /* 12 */{3*packet_traits::size, 1,4, 6, 11, SHAPES_POINTER_END}, + /* 13 */{3*packet_traits::size,__UNROLL__,4, 6, 11, 12}}; // d1progress x d2progress template diff --git a/new_gemm_test.cpp b/new_gemm_test.cpp index f6fbbca48..dabb33d02 100644 --- a/new_gemm_test.cpp +++ b/new_gemm_test.cpp @@ -28,7 +28,6 @@ int main(int argc, char* argv[]) std::cout << A << std::endl; std::cout << B << std::endl; - std::cout << C << std::endl; std::cout << std::endl; @@ -48,8 +47,6 @@ int main(int argc, char* argv[]) } } } - - std::cout << C << std::endl; #else if(argc < 3) { diff --git a/run.sh b/run.sh index fd17888fe..063aee907 100755 --- a/run.sh +++ b/run.sh @@ -28,6 +28,7 @@ function run() { run $1 16 500 run $1 32 500 -run $1 64 500 -run $1 128 100 -run $1 256 50 +run $1 64 100 +run $1 128 50 +run $1 256 10 +run $1 1024 10