From 6533187280aefe7c9b331d9c04e96ddd929eb226 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Tue, 18 May 2021 20:42:08 +0000 Subject: [PATCH] WIP 2 - need to implement 2x1x1 --- Eigen/src/Core/arch/NEON/Kernels.h | 120 ++++++++++++++++++++++- Eigen/src/Core/arch/NEON/MatrixProduct.h | 59 ++++++----- new_gemm_test.cpp | 6 +- run.sh | 8 +- 4 files changed, 167 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index 19bd38fd0..79f19e477 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -74,6 +74,21 @@ namespace internal { lhsPackMap.advance(4*1); \ rhsPackMap.advance(1*4); +#define MICRO_2x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs = pload(rhsPackMap.pCur + 2); \ + pRhs2 = pset1(pRhs[0]); \ + pRhs3 = pset1(pRhs[1]); \ + acc._acc.packet[0] += pLhs*pRhs0; \ + acc._acc.packet[1] += pLhs*pRhs1; \ + acc._acc.packet[2] += pLhs*pRhs2; \ + acc._acc.packet[3] += pLhs*pRhs3; \ + lhsPackMap.advance(2*1); \ + rhsPackMap.advance(1*4); + #define MICRO_12x1x1(K) \ pLhs = pload(lhsPackMap.pCur + (0 + 3*K)*4); \ pLhs2 = pload(lhsPackMap.pCur + (1 + 3*K)*4); \ @@ -116,6 +131,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1> _acc.packet[2] = pset1(0); } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} template @@ -156,6 +172,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> _acc.packet[1] = pset1(0); } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} template @@ -189,7 +206,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> { _acc = pset1(0); } - + + template EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} template @@ -221,6 +239,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> _acc = pset1(0); } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} template @@ -237,6 +256,80 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> } }; +//[TODO] Implement this properly +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 2, 4> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::half; + using ResPacket = typename packet_traits::type; + + LinearMapper r0{nullptr}; + LinearMapper r1{nullptr}; + LinearMapper r2{nullptr}; + LinearMapper r3{nullptr}; + + PacketBlock _acc; + + EIGEN_STRONG_INLINE void zero() + { + _acc.packet[0] = pset1(0); + _acc.packet[1] = pset1(0); + _acc.packet[2] = pset1(0); + _acc.packet[3] = pset1(0); + } + + template + EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) + { + asm __volatile__("#BEGIN_PREFETCH_2x4\n\t"); + r0 = dest.getLinearMapper(row + 0, col + 0); + r1 = dest.getLinearMapper(row + 0, col + 1); + r2 = dest.getLinearMapper(row + 0, col + 2); + r3 = dest.getLinearMapper(row + 0, col + 3); + +#ifdef __ENABLE_PREFETCH__ + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); +#endif + asm __volatile__("#END_PREFETCH_2x4\n\t"); + } + + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + // _acc.packet[0] *= pAlpha; + // _acc.packet[1] *= pAlpha; + // _acc.packet[2] *= pAlpha; + // _acc.packet[3] *= pAlpha; + } + + template + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha) + { + asm __volatile__("#BEGIN_STORE_2x4\n\t"); + constexpr auto PacketSize = unpacket_traits::size; + AccPacket ppAlpha = pset1(alpha); + AccPacket R00 = r0.template loadPacket(0*PacketSize); + AccPacket R01 = r1.template loadPacket(0*PacketSize); + AccPacket R02 = r2.template loadPacket(0*PacketSize); + AccPacket R03 = r3.template loadPacket(0*PacketSize); + + R00 += ppAlpha*_acc.packet[0]; + R01 += ppAlpha*_acc.packet[1]; + R02 += ppAlpha*_acc.packet[2]; + R03 += ppAlpha*_acc.packet[3]; + + r0.storePacket(0*PacketSize, R00); + r1.storePacket(0*PacketSize, R01); + r2.storePacket(0*PacketSize, R02); + r3.storePacket(0*PacketSize, R03); + asm __volatile__("#END_STORE_2x4\n\t"); + } +}; + template struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> { @@ -259,6 +352,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> _acc.packet[3] = pset1(0); } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) { asm __volatile__("#BEGIN_PREFETCH_4x4\n\t"); @@ -336,6 +430,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> _acc2.packet[3] = pset1(0); } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) { constexpr Index offset = 32 / sizeof(ResScalar); @@ -437,6 +532,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4> _acc3.packet[3] = pset1(0); } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) { asm __volatile__("#BEGIN_PREFETCH_12x4\n\t"); @@ -710,6 +806,28 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, }; }; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 2, 1, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::half; + using RhsPacket = typename packet_traits::half; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_2x1x4\n\t"); + + LhsPacket pLhs; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + + MICRO_2x1x4(); + + asm __volatile__("#END_NEON_MICROKERNEL_2x1x4\n\t"); + }; +}; + template struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1> { diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h index 36f441e94..9b37d7098 100644 --- a/Eigen/src/Core/arch/NEON/MatrixProduct.h +++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h @@ -23,7 +23,7 @@ namespace internal { #endif template -constexpr int SHAPES_COUNT = 14; +constexpr int SHAPES_COUNT = 16; constexpr int SHAPES_DIMENSION = 6; constexpr int SHAPES_LHS_DIMENSION = 0; @@ -44,23 +44,35 @@ constexpr int PACK_SHAPES_DIMENSION = 3; constexpr int PACK_SHAPES_POINTER = 2; constexpr int PACK_SHAPES_END = -1; +template +struct PacketMultiples +{ + enum + { + half = unpacket_traits::half>::size, + quarter = unpacket_traits::half>::size // Is this used? + }; +}; + // lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map template constexpr int SHAPES[SHAPES_COUNT][SHAPES_DIMENSION] = { /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /* 01 */{1*packet_traits::size, 1,1, 0, 0, SHAPES_POINTER_END}, - /* 02 */{1*packet_traits::size,__UNROLL__,1, 0, 0, 1}, - /* 03 */{2*packet_traits::size, 1,1, 0, 2, SHAPES_POINTER_END}, - /* 04 */{2*packet_traits::size,__UNROLL__,1, 0, 2, 3}, - /* 05 */{3*packet_traits::size, 1,1, 0, 4, SHAPES_POINTER_END}, - /* 06 */{3*packet_traits::size,__UNROLL__,1, 0, 4, 5}, - /* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /* 08 */{1*packet_traits::size, 1,4, 6, 7, SHAPES_POINTER_END}, - /* 09 */{1*packet_traits::size,__UNROLL__,4, 6, 7, 8}, - /* 10 */{2*packet_traits::size, 1,4, 6, 9, SHAPES_POINTER_END}, - /* 11 */{2*packet_traits::size,__UNROLL__,4, 6, 9, 10}, - /* 12 */{3*packet_traits::size, 1,4, 6, 11, SHAPES_POINTER_END}, - /* 13 */{3*packet_traits::size,__UNROLL__,4, 6, 11, 12}}; + /* 01 */{PacketMultiples::half, 1,1, 0, 0, SHAPES_POINTER_END}, + /* 02 */{1*packet_traits::size, 1,1, 0, 1, SHAPES_POINTER_END}, + /* 03 */{1*packet_traits::size,__UNROLL__,1, 0, 1, 2}, + /* 04 */{2*packet_traits::size, 1,1, 0, 3, SHAPES_POINTER_END}, + /* 05 */{2*packet_traits::size,__UNROLL__,1, 0, 3, 4}, + /* 06 */{3*packet_traits::size, 1,1, 0, 5, SHAPES_POINTER_END}, + /* 07 */{3*packet_traits::size,__UNROLL__,1, 0, 5, 6}, + /* 08 */{ 1, 1,4, 7, SHAPES_POINTER_END, SHAPES_POINTER_END}, + /* 09 */{PacketMultiples::half, 1,4, 7, 8, SHAPES_POINTER_END}, + /* 10 */{1*packet_traits::size, 1,4, 7, 9, SHAPES_POINTER_END}, + /* 11 */{1*packet_traits::size,__UNROLL__,4, 7, 9, 10}, + /* 12 */{2*packet_traits::size, 1,4, 7, 11, SHAPES_POINTER_END}, + /* 13 */{2*packet_traits::size,__UNROLL__,4, 7, 11, 12}, + /* 14 */{3*packet_traits::size, 1,4, 7, 13, SHAPES_POINTER_END}, + /* 15 */{3*packet_traits::size,__UNROLL__,4, 7, 13, 14}}; // d1progress x d2progress template @@ -218,6 +230,8 @@ struct PackMap EIGEN_STRONG_INLINE void updateBase() { pBase = pCur; } EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); } EIGEN_STRONG_INLINE void advance(Index progress) { pCur += progress; } + + template EIGEN_STRONG_INLINE void prefetch(Index amnt) { #ifdef __ENABLE_PREFETCH__ @@ -242,6 +256,7 @@ struct Accumulator } } + template EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} template @@ -321,9 +336,9 @@ struct DepthLoopStruct EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res, Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) { - constexpr auto rhsProgress = SHAPES[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION]; - constexpr auto lhsProgress = SHAPES[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION]; - constexpr auto depthProgress = SHAPES[IDX][SHAPES_DEP_DIMENSION]; + constexpr int rhsProgress = SHAPES[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION]; + constexpr int lhsProgress = SHAPES[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION]; + constexpr int depthProgress = SHAPES[IDX][SHAPES_DEP_DIMENSION]; typedef Accumulator AccumulatorType; @@ -332,11 +347,10 @@ struct DepthLoopStruct acc.zero(); - acc.prefetch(res, rowIdx, colIdx); + acc.template prefetch(res, rowIdx, colIdx); - lhsPackMap.prefetch(0); - if(rhsProgress > 1) - rhsPackMap.prefetch(0); + lhsPackMap.template prefetch(0); + rhsPackMap.template prefetch(0); for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress) { @@ -367,9 +381,10 @@ struct LhsLoopStruct constexpr auto lhsProgress = SHAPES[IDX][SHAPES_LHS_DIMENSION]; constexpr auto rhsProgress = SHAPES[IDX][SHAPES_RHS_DIMENSION]; DepthLoopStruct depthLS; - rhsPackMap.resetCur(); + //rhsPackMap.resetCur(); for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress) { + rhsPackMap.resetCur(); //lhsPackMap.moveTo(rowIdx); //rhsPackMap.moveTo(colIdx); diff --git a/new_gemm_test.cpp b/new_gemm_test.cpp index 42d60d8cf..b936924ca 100644 --- a/new_gemm_test.cpp +++ b/new_gemm_test.cpp @@ -15,7 +15,8 @@ void set(MatrixXf& A, int m, int n, int id, int digits) int main(int argc, char* argv[]) { #ifdef __DEBUG__ - int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n); + int m = std::atoi(argv[1]), k = std::atoi(argv[1]), n = std::atoi(argv[1]); + int max = std::max(std::max(m,k),n); MatrixXf A = MatrixXf::Zero(m, k); MatrixXf B = MatrixXf::Zero(k, n); MatrixXf C = MatrixXf::Zero(m, n); @@ -24,7 +25,8 @@ int main(int argc, char* argv[]) set(A, m, k, 1, static_cast(std::log10(max)) + 1); set(B, k, n, 2, static_cast(std::log10(max)) + 1); - C = A*B; + for(auto i = 0; i < 2; i++) + C = A*B; std::cout << A << std::endl; std::cout << B << std::endl; diff --git a/run.sh b/run.sh index 80633c689..f560695ec 100755 --- a/run.sh +++ b/run.sh @@ -27,8 +27,14 @@ function run() { } run $1 16 500 +run $1 21 500 run $1 32 500 +run $1 53 500 run $1 64 100 +run $1 97 100 run $1 128 50 +run $1 203 50 run $1 256 10 -run $1 1024 10 +run $1 673 10 +run $1 1024 5 +run $1 2048 2