From ca0d3f92d7c04451fc428177aed2057138ef4a6c Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Thu, 22 Apr 2021 14:48:44 +0000 Subject: [PATCH] WIP --- Eigen/Core | 2 + Eigen/src/Core/arch/NEON/Kernels.h | 21 +++++ Eigen/src/Core/arch/NEON/MatrixProduct.h | 108 ++++++++++++++--------- Eigen/src/Core/arch/NEON/PackingOps.h | 21 +++++ 4 files changed, 108 insertions(+), 44 deletions(-) create mode 100644 Eigen/src/Core/arch/NEON/Kernels.h create mode 100644 Eigen/src/Core/arch/NEON/PackingOps.h diff --git a/Eigen/Core b/Eigen/Core index 5545fd011..cfc41f9e5 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -351,6 +351,8 @@ using std::ptrdiff_t; #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" #include "src/Core/arch/NEON/MatrixProduct.h" + #include "src/Core/arch/NEON/PackingOps.h" + #include "src/Core/arch/NEON/Kernels.h" #endif #include "src/Core/BooleanRedux.h" diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h new file mode 100644 index 000000000..2f16841b9 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -0,0 +1,21 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KERNELS_NEON_H +#define EIGEN_KERNELS_NEON_H + +namespace Eigen { + +namespace internal { + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_KERNELS_NEON_H \ No newline at end of file diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h index 7adc213c9..88968b4f0 100644 --- a/Eigen/src/Core/arch/NEON/MatrixProduct.h +++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h @@ -592,7 +592,9 @@ struct PackingOperator { EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data) { +#ifdef __DEBUG__ std::cout << M << "x" << N << " ( " << d1Idx << ", " << d2Idx <<") -> ( " << d1Idx + M << ", " << d2Idx + N << ") "; +#endif Scalar *c = block; for(auto i = 0; i < M; i++) for(auto j = 0; j < N; j++) @@ -601,10 +603,14 @@ struct PackingOperator *c = data(d1Idx + i, d2Idx + j); else *c = data(d2Idx + j, d1Idx + i); +#ifdef __DEBUG__ std::cout << *c << " "; +#endif c++; } +#ifdef __DEBUG__ std::cout << std::endl; +#endif return c; } }; @@ -713,10 +719,12 @@ struct PackMap { const Scalar *pBase; const Scalar *pCur; + Index stride; + Index offset; Index d2Size; PackMapCalculator-1> pmc; - PackMap(const Scalar *base, Index d2Size) : pBase(base), pCur(base), d2Size(d2Size) {} + PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset) {} EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; } EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); } @@ -739,7 +747,8 @@ struct Accumulator } } - EIGEN_STRONG_INLINE void scale(ResScalar alpha) + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha) { for(auto i = 0; i < M; i++) { @@ -762,14 +771,15 @@ struct Accumulator } }; -template +template struct MicroKernel { - EIGEN_STRONG_INLINE void operator()(PackMap& lhsPackMap, - PackMap& rhsPackMap, + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) + Accumulator& acc) { +#ifdef __DEBUG__ std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl; std::cout << "LHS "; for(auto i = 0; i < M; i++) @@ -788,6 +798,7 @@ struct MicroKernel } } std::cout << std::endl; +#endif const RhsScalar *pRhs = rhsPackMap.pCur; for(auto i = 0; i < N; i++) { @@ -802,98 +813,104 @@ struct MicroKernel }; }; -template +template struct DepthLoopStruct { - DepthLoopStruct depthLS; - EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res, const LhsScalar* blockA, const RhsScalar*blockB, - Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, PackMap& lhsPackMap, PackMap& rhsPackMap) + DepthLoopStruct depthLS; + EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res, + Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) { constexpr auto rhsProgress = SHAPES[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION]; constexpr auto lhsProgress = SHAPES[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION]; constexpr auto depthProgress = SHAPES[IDX][SHAPES_DEP_DIMENSION]; + typedef Accumulator AccumulatorType; if(rhsProgress == SHAPES[IDX][SHAPES_RHS_DIMENSION] && lhsProgress == SHAPES[IDX][SHAPES_LHS_DIMENSION]) { - MicroKernel mkt; - Accumulator acc; + MicroKernel mkt; + AccumulatorType acc; acc.zero(); for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress) { mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc); } - acc.scale(alpha); + acc.scale(alpha, pAlpha); acc.store(res, rowIdx, colIdx); } - depthLS(rowIdx, colIdx, depthIdx, res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, lhsPackMap, rhsPackMap); + depthLS(rowIdx, colIdx, depthIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } }; -template -struct DepthLoopStruct +template +struct DepthLoopStruct { - EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&, const LhsScalar*, const RhsScalar*, - Index, Index, Index, ResScalar, Index, Index, Index, Index, PackMap&, PackMap&) {} + EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&, + Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {} }; -template +template struct LhsLoopStruct { - LhsLoopStruct lhsLS; - EIGEN_STRONG_INLINE void operator()(Index rowIdx, int colIdx, const DataMapper& res, const LhsScalar* blockA, const RhsScalar*blockB, - Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, PackMap& lhsPackMap, PackMap& rhsPackMap) + LhsLoopStruct lhsLS; + EIGEN_STRONG_INLINE void operator()(Index rowIdx, int colIdx, const DataMapper& res, + Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) { constexpr auto lhsProgress = SHAPES[IDX][SHAPES_LHS_DIMENSION]; - DepthLoopStruct depthLS; + DepthLoopStruct depthLS; for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress) { lhsPackMap.moveTo(rowIdx); rhsPackMap.moveTo(colIdx); - depthLS(rowIdx, colIdx, 0, res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, lhsPackMap, rhsPackMap); + depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } - lhsLS(rowIdx, colIdx, res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, lhsPackMap, rhsPackMap); + lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } }; -template -struct LhsLoopStruct +template +struct LhsLoopStruct { - EIGEN_STRONG_INLINE void operator()(Index, Index, const DataMapper&, const LhsScalar*, const RhsScalar*, - Index, Index, Index, ResScalar, Index, Index, Index, Index, PackMap&, PackMap&) {} + EIGEN_STRONG_INLINE void operator()(Index, Index, const DataMapper&, + Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {} }; -template +template struct RhsLoopStruct { static constexpr auto PREVIOUS = SHAPES[IDX][SHAPES_POINTER]; - RhsLoopStruct rhsLS; + RhsLoopStruct rhsLS; - EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper& res, const LhsScalar* blockA, const RhsScalar*blockB, - Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, PackMap& lhsPackMap, PackMap& rhsPackMap) + EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper& res, + Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) { constexpr auto rhsProgress = SHAPES[IDX][SHAPES_RHS_DIMENSION]; for(;colIdx + rhsProgress <= cols; colIdx+=rhsProgress) { - LhsLoopStruct lhsLS; - lhsLS(0, colIdx, res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, lhsPackMap, rhsPackMap); + LhsLoopStruct lhsLS; + lhsLS(0, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } - rhsLS(colIdx, res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, lhsPackMap, rhsPackMap); + rhsLS(colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } }; -template -struct RhsLoopStruct +template +struct RhsLoopStruct { - EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper&, const LhsScalar*, const RhsScalar*, - Index, Index, Index, ResScalar, Index, Index, Index, Index, PackMap&, PackMap&) {} + EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper&, + Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {} }; template EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { + using ResPacket = typename unpacket_traits::type; + typedef PackMap LhsPackMap; + typedef PackMap RhsPackMap; + +#ifdef __DEBUG__ std::cout << "blockA" << std::endl; for(auto i = 0; i < rows*depth; i++) { @@ -910,11 +927,14 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, co std::cout << blockB[i] << " "; } std::cout << std::endl; +#endif + RhsLoopStruct-1> rhsLS; + LhsPackMap lhsPackMap(blockA, depth, strideA, offsetA); + RhsPackMap rhsPackMap(blockB, depth, strideB, offsetB); - RhsLoopStruct-1> rhsLS; - PackMap lhsPackMap(blockA, depth); - PackMap rhsPackMap(blockB, depth); - rhsLS(0, res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, lhsPackMap, rhsPackMap); + ResPacket pAlpha = pset1(alpha); + + rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } template diff --git a/Eigen/src/Core/arch/NEON/PackingOps.h b/Eigen/src/Core/arch/NEON/PackingOps.h new file mode 100644 index 000000000..72edb800f --- /dev/null +++ b/Eigen/src/Core/arch/NEON/PackingOps.h @@ -0,0 +1,21 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKING_OPS_NEON_H +#define EIGEN_PACKING_OPS_NEON_H + +namespace Eigen { + +namespace internal { + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKING_OPS_NEON_H \ No newline at end of file