From 70c0363c281dbdf9f1ffc35aae47d6c1fa7e9ee0 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Mon, 10 May 2021 19:59:47 +0000 Subject: [PATCH] WIP2 --- Eigen/src/Core/arch/NEON/Kernels.h | 952 +++++++++++------------ Eigen/src/Core/arch/NEON/MatrixProduct.h | 8 +- Eigen/src/Core/arch/NEON/PackingOps.h | 50 +- 3 files changed, 518 insertions(+), 492 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index 781796875..011e5d866 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -14,494 +14,470 @@ namespace Eigen { namespace internal { -template -constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9; - -template -constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] = - { /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END}, - /*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END}, - /*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END}, - /*5*/ {4,4,4, 2, 2, 3}, - /*6*/ {4,8,4, 2, 2, 4}, - /*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END}, - /*8*/ {8,4,4, 2, 4, 6}}; - -template -struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> -{ - using LinearMapper = typename DataMapper::LinearMapper; - using AccPacket = typename packet_traits::type; - using ResPacket = typename packet_traits::type; - - PacketBlock _acc; - - EIGEN_STRONG_INLINE void zero() - { - _acc.packet[0] = pset1(0); - _acc.packet[1] = pset1(0); - } - - template - EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) - { - _acc.packet[0] *= pAlpha; - _acc.packet[1] *= pAlpha; - } - - EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) - { - //eigen_assert(false && "4x1"); - //LinearMapper r0 = dest.getLinearMapper(row, col + 0); - - //r0.storePacket(0, r0.template loadPacket(0) + _acc); - PacketBlock block; - block.packet[0] = dest.template loadPacket(row, col) + _acc.packet[0]; - dest.template storePacketBlock(row, col, block); - block.packet[0] = dest.template loadPacket(row + 4, col) + _acc.packet[1]; - dest.template storePacketBlock(row + 4, col, block); - /* - dest(row + 0, col + 0) += _acc.packet[0][0]; - dest(row + 1, col + 0) += _acc.packet[0][1]; - dest(row + 2, col + 0) += _acc.packet[0][2]; - dest(row + 3, col + 0) += _acc.packet[0][3]; - - dest(row + 4, col + 0) += _acc.packet[1][0]; - dest(row + 5, col + 0) += _acc.packet[1][1]; - dest(row + 6, col + 0) += _acc.packet[1][2]; - dest(row + 7, col + 0) += _acc.packet[1][3]; - */ - } -}; - -template -struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> -{ - using LinearMapper = typename DataMapper::LinearMapper; - using AccPacket = typename packet_traits::type; - using ResPacket = typename packet_traits::type; - - AccPacket _acc; - - EIGEN_STRONG_INLINE void zero() - { - _acc = pset1(0); - } - - template - EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) - { - _acc *= pAlpha; - } - - EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) - { - //eigen_assert(false && "4x1"); - //LinearMapper r0 = dest.getLinearMapper(row, col + 0); - - //r0.storePacket(0, r0.template loadPacket(0) + _acc); - PacketBlock block; - block.packet[0] = dest.template loadPacket(row, col) + _acc; - dest.template storePacketBlock(row, col, block); - /* - dest(row + 0, col) += _acc[0]; - dest(row + 1, col) += _acc[1]; - dest(row + 2, col) += _acc[2]; - dest(row + 3, col) += _acc[3]; - */ - } -}; - -template -struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> -{ - using LinearMapper = typename DataMapper::LinearMapper; - using AccPacket = typename packet_traits::type; - using ResPacket = typename packet_traits::type; - - AccPacket _acc; - - EIGEN_STRONG_INLINE void zero() - { - _acc = pset1(0); - } - - template - EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) - { - _acc *= pAlpha; - } - - EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) - { - ResPacket r = dest.template gatherPacket(row, col) + _acc; - dest.template scatterPacket(row, col, r); - } -}; - -template -struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> -{ - using LinearMapper = typename DataMapper::LinearMapper; - using AccPacket = typename packet_traits::type; - using ResPacket = typename packet_traits::type; - - PacketBlock _acc; - - EIGEN_STRONG_INLINE void zero() - { - _acc.packet[0] = pset1(0); - _acc.packet[1] = pset1(0); - _acc.packet[2] = pset1(0); - _acc.packet[3] = pset1(0); - } - - template - EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) - { - _acc.packet[0] *= pAlpha; - _acc.packet[1] *= pAlpha; - _acc.packet[2] *= pAlpha; - _acc.packet[3] *= pAlpha; - } - - EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) - { - LinearMapper r0 = dest.getLinearMapper(row, col + 0); - LinearMapper r1 = dest.getLinearMapper(row, col + 1); - LinearMapper r2 = dest.getLinearMapper(row, col + 2); - LinearMapper r3 = dest.getLinearMapper(row, col + 3); - - r0.storePacket(0, r0.template loadPacket(0) + _acc.packet[0]); - r1.storePacket(0, r1.template loadPacket(0) + _acc.packet[1]); - r2.storePacket(0, r2.template loadPacket(0) + _acc.packet[2]); - r3.storePacket(0, r3.template loadPacket(0) + _acc.packet[3]); - } -}; - -template -struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> -{ - using LinearMapper = typename DataMapper::LinearMapper; - using AccPacket = typename packet_traits::type; - using ResPacket = typename packet_traits::type; - - PacketBlock _acc1; - PacketBlock _acc2; - - EIGEN_STRONG_INLINE void zero() - { - _acc1.packet[0] = pset1(0); - _acc1.packet[1] = pset1(0); - _acc1.packet[2] = pset1(0); - _acc1.packet[3] = pset1(0); - - _acc2.packet[0] = pset1(0); - _acc2.packet[1] = pset1(0); - _acc2.packet[2] = pset1(0); - _acc2.packet[3] = pset1(0); - } - - template - EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) - { - _acc1.packet[0] *= pAlpha; - _acc1.packet[1] *= pAlpha; - _acc1.packet[2] *= pAlpha; - _acc1.packet[3] *= pAlpha; - - _acc2.packet[0] *= pAlpha; - _acc2.packet[1] *= pAlpha; - _acc2.packet[2] *= pAlpha; - _acc2.packet[3] *= pAlpha; - } - - EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) - { - LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); - LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); - LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); - LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); - - LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); - LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); - LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); - LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); - - - r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); - r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); - r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); - r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); - - r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); - r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); - r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); - r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); - } -}; - -#define MICRO_8x1x4() \ - pLhs = pload(lhsPackMap.pCur); \ - lhsPackMap.advance(4*1); \ - pLhs2 = pload(lhsPackMap.pCur); \ - pRhs = pload(rhsPackMap.pCur); \ - pRhs0 = pset1(pRhs[0]); \ - pRhs1 = pset1(pRhs[1]); \ - pRhs2 = pset1(pRhs[2]); \ - pRhs3 = pset1(pRhs[3]); \ - acc._acc1.packet[0] += pLhs*pRhs0; \ - acc._acc1.packet[1] += pLhs*pRhs1; \ - acc._acc1.packet[2] += pLhs*pRhs2; \ - acc._acc1.packet[3] += pLhs*pRhs3; \ - acc._acc2.packet[0] += pLhs2*pRhs0; \ - acc._acc2.packet[1] += pLhs2*pRhs1; \ - acc._acc2.packet[2] += pLhs2*pRhs2; \ - acc._acc2.packet[3] += pLhs2*pRhs3; \ - lhsPackMap.advance(4*1); \ - rhsPackMap.advance(1*4); - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t"); - - LhsPacket pLhs, pLhs2; - RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - - asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t"); - - LhsPacket pLhs, pLhs2; - RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - MICRO_8x1x4(); - - asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); - - LhsPacket pLhs, pLhs2; - RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - - MICRO_8x1x4(); - - asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); - }; -}; - -#define MICRO_4x1x4() \ - pLhs = pload(lhsPackMap.pCur); \ - pRhs = pload(rhsPackMap.pCur); \ - pRhs0 = pset1(pRhs[0]); \ - pRhs1 = pset1(pRhs[1]); \ - pRhs2 = pset1(pRhs[2]); \ - pRhs3 = pset1(pRhs[3]); \ - acc._acc.packet[0] += pLhs*pRhs0; \ - acc._acc.packet[1] += pLhs*pRhs1; \ - acc._acc.packet[2] += pLhs*pRhs2; \ - acc._acc.packet[3] += pLhs*pRhs3; \ - lhsPackMap.advance(4*1); \ - rhsPackMap.advance(1*4); - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); - LhsPacket pLhs; - RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - - asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); - LhsPacket pLhs; - RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - MICRO_4x1x4(); - asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t"); - - LhsPacket pLhs; - RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - - MICRO_4x1x4(); - - asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); - - LhsPacket pLhs = pload(lhsPackMap.pCur); - RhsPacket pRhs = pset1(*rhsPackMap.pCur); - - acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); - lhsPackMap.advance(4*1); - pLhs = pload(lhsPackMap.pCur); - acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); - - lhsPackMap.advance(4*1); - rhsPackMap.advance(1); - asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using LhsPacket = typename packet_traits::type; - using RhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); - - LhsPacket pLhs = pload(lhsPackMap.pCur); - RhsPacket pRhs = pset1(*rhsPackMap.pCur); - - //acc._acc += pRhs*pLhs; - acc._acc = pmadd(pRhs, pLhs, acc._acc); - - lhsPackMap.advance(4*1); - rhsPackMap.advance(1); - asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); - }; -}; - -template -struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4> -{ - EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, - RhsPackMap& rhsPackMap, - Index rowIdx, Index colIdx, Index depthIdx, - Accumulator& acc) - { - using RhsPacket = typename packet_traits::type; - using LhsPacket = typename packet_traits::type; - - asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t"); - - LhsPacket pLhs = pset1(*lhsPackMap.pCur); - RhsPacket pRhs = pload(rhsPackMap.pCur); - - acc._acc += pLhs*pRhs; - - lhsPackMap.advance(1); - rhsPackMap.advance(4*1); - asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t"); - }; -}; - +// template +// constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9; + +// template +// constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] = +// { /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, +// /*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END}, +// /*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END}, +// /*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END}, +// /*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END}, +// /*5*/ {4,4,4, 2, 2, 3}, +// /*6*/ {4,8,4, 2, 2, 4}, +// /*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END}, +// /*8*/ {8,4,4, 2, 4, 6}}; + +// template +// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> +// { +// using LinearMapper = typename DataMapper::LinearMapper; +// using AccPacket = typename packet_traits::type; +// using ResPacket = typename packet_traits::type; + +// PacketBlock _acc; + +// EIGEN_STRONG_INLINE void zero() +// { +// _acc.packet[0] = pset1(0); +// _acc.packet[1] = pset1(0); +// } + +// template +// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) +// { +// _acc.packet[0] *= pAlpha; +// _acc.packet[1] *= pAlpha; +// } + +// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) +// { +// PacketBlock block; +// block.packet[0] = dest.template loadPacket(row, col) + _acc.packet[0]; +// dest.template storePacketBlock(row, col, block); +// block.packet[0] = dest.template loadPacket(row + 4, col) + _acc.packet[1]; +// dest.template storePacketBlock(row + 4, col, block); +// } +// }; + +// template +// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> +// { +// using LinearMapper = typename DataMapper::LinearMapper; +// using AccPacket = typename packet_traits::type; +// using ResPacket = typename packet_traits::type; + +// AccPacket _acc; + +// EIGEN_STRONG_INLINE void zero() +// { +// _acc = pset1(0); +// } + +// template +// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) +// { +// _acc *= pAlpha; +// } + +// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) +// { +// PacketBlock block; +// block.packet[0] = dest.template loadPacket(row, col) + _acc; +// dest.template storePacketBlock(row, col, block); +// } +// }; + +// template +// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> +// { +// using LinearMapper = typename DataMapper::LinearMapper; +// using AccPacket = typename packet_traits::type; +// using ResPacket = typename packet_traits::type; + +// AccPacket _acc; + +// EIGEN_STRONG_INLINE void zero() +// { +// _acc = pset1(0); +// } + +// template +// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) +// { +// _acc *= pAlpha; +// } + +// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) +// { +// ResPacket r = dest.template gatherPacket(row, col) + _acc; +// dest.template scatterPacket(row, col, r); +// } +// }; + +// template +// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> +// { +// using LinearMapper = typename DataMapper::LinearMapper; +// using AccPacket = typename packet_traits::type; +// using ResPacket = typename packet_traits::type; + +// PacketBlock _acc; + +// EIGEN_STRONG_INLINE void zero() +// { +// _acc.packet[0] = pset1(0); +// _acc.packet[1] = pset1(0); +// _acc.packet[2] = pset1(0); +// _acc.packet[3] = pset1(0); +// } + +// template +// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) +// { +// _acc.packet[0] *= pAlpha; +// _acc.packet[1] *= pAlpha; +// _acc.packet[2] *= pAlpha; +// _acc.packet[3] *= pAlpha; +// } + +// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) +// { +// LinearMapper r0 = dest.getLinearMapper(row, col + 0); +// LinearMapper r1 = dest.getLinearMapper(row, col + 1); +// LinearMapper r2 = dest.getLinearMapper(row, col + 2); +// LinearMapper r3 = dest.getLinearMapper(row, col + 3); + +// r0.storePacket(0, r0.template loadPacket(0) + _acc.packet[0]); +// r1.storePacket(0, r1.template loadPacket(0) + _acc.packet[1]); +// r2.storePacket(0, r2.template loadPacket(0) + _acc.packet[2]); +// r3.storePacket(0, r3.template loadPacket(0) + _acc.packet[3]); +// } +// }; + +// template +// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> +// { +// using LinearMapper = typename DataMapper::LinearMapper; +// using AccPacket = typename packet_traits::type; +// using ResPacket = typename packet_traits::type; + +// PacketBlock _acc1; +// PacketBlock _acc2; + +// EIGEN_STRONG_INLINE void zero() +// { +// _acc1.packet[0] = pset1(0); +// _acc1.packet[1] = pset1(0); +// _acc1.packet[2] = pset1(0); +// _acc1.packet[3] = pset1(0); + +// _acc2.packet[0] = pset1(0); +// _acc2.packet[1] = pset1(0); +// _acc2.packet[2] = pset1(0); +// _acc2.packet[3] = pset1(0); +// } + +// template +// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) +// { +// _acc1.packet[0] *= pAlpha; +// _acc1.packet[1] *= pAlpha; +// _acc1.packet[2] *= pAlpha; +// _acc1.packet[3] *= pAlpha; + +// _acc2.packet[0] *= pAlpha; +// _acc2.packet[1] *= pAlpha; +// _acc2.packet[2] *= pAlpha; +// _acc2.packet[3] *= pAlpha; +// } + +// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) +// { +// LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); +// LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); +// LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); +// LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); + +// LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); +// LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); +// LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); +// LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); + + +// r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); +// r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); +// r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); +// r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); + +// r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); +// r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); +// r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); +// r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); +// } +// }; + +// #define MICRO_8x1x4() \ +// pLhs = pload(lhsPackMap.pCur); \ +// lhsPackMap.advance(4*1); \ +// pLhs2 = pload(lhsPackMap.pCur); \ +// pRhs = pload(rhsPackMap.pCur); \ +// pRhs0 = pset1(pRhs[0]); \ +// pRhs1 = pset1(pRhs[1]); \ +// pRhs2 = pset1(pRhs[2]); \ +// pRhs3 = pset1(pRhs[3]); \ +// acc._acc1.packet[0] += pLhs*pRhs0; \ +// acc._acc1.packet[1] += pLhs*pRhs1; \ +// acc._acc1.packet[2] += pLhs*pRhs2; \ +// acc._acc1.packet[3] += pLhs*pRhs3; \ +// acc._acc2.packet[0] += pLhs2*pRhs0; \ +// acc._acc2.packet[1] += pLhs2*pRhs1; \ +// acc._acc2.packet[2] += pLhs2*pRhs2; \ +// acc._acc2.packet[3] += pLhs2*pRhs3; \ +// lhsPackMap.advance(4*1); \ +// rhsPackMap.advance(1*4); + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t"); + +// LhsPacket pLhs, pLhs2; +// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); + +// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t"); + +// LhsPacket pLhs, pLhs2; +// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +// prefetch(lhsPackMap.pCur + 2*32); +// prefetch(rhsPackMap.pCur + 2*16); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); +// MICRO_8x1x4(); + +// asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); + +// LhsPacket pLhs, pLhs2; +// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +// MICRO_8x1x4(); + +// asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); +// }; +// }; + +// #define MICRO_4x1x4() \ +// pLhs = pload(lhsPackMap.pCur); \ +// pRhs = pload(rhsPackMap.pCur); \ +// pRhs0 = pset1(pRhs[0]); \ +// pRhs1 = pset1(pRhs[1]); \ +// pRhs2 = pset1(pRhs[2]); \ +// pRhs3 = pset1(pRhs[3]); \ +// acc._acc.packet[0] += pLhs*pRhs0; \ +// acc._acc.packet[1] += pLhs*pRhs1; \ +// acc._acc.packet[2] += pLhs*pRhs2; \ +// acc._acc.packet[3] += pLhs*pRhs3; \ +// lhsPackMap.advance(4*1); \ +// rhsPackMap.advance(1*4); + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); +// LhsPacket pLhs; +// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); + +// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); +// LhsPacket pLhs; +// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// MICRO_4x1x4(); +// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t"); + +// LhsPacket pLhs; +// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +// MICRO_4x1x4(); + +// asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + +// LhsPacket pLhs = pload(lhsPackMap.pCur); +// RhsPacket pRhs = pset1(*rhsPackMap.pCur); + +// acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); +// lhsPackMap.advance(4*1); +// pLhs = pload(lhsPackMap.pCur); +// acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); + +// lhsPackMap.advance(4*1); +// rhsPackMap.advance(1); +// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using LhsPacket = typename packet_traits::type; +// using RhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + +// LhsPacket pLhs = pload(lhsPackMap.pCur); +// RhsPacket pRhs = pset1(*rhsPackMap.pCur); + +// //acc._acc += pRhs*pLhs; +// acc._acc = pmadd(pRhs, pLhs, acc._acc); + +// lhsPackMap.advance(4*1); +// rhsPackMap.advance(1); +// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); +// }; +// }; + +// template +// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4> +// { +// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, +// RhsPackMap& rhsPackMap, +// Index rowIdx, Index colIdx, Index depthIdx, +// Accumulator& acc) +// { +// using RhsPacket = typename packet_traits::type; +// using LhsPacket = typename packet_traits::type; + +// asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t"); + +// LhsPacket pLhs = pset1(*lhsPackMap.pCur); +// RhsPacket pRhs = pload(rhsPackMap.pCur); + +// acc._acc += pLhs*pRhs; + +// lhsPackMap.advance(1); +// rhsPackMap.advance(4*1); +// asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t"); +// }; +// }; } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h index 61af4e911..8871871f1 100644 --- a/Eigen/src/Core/arch/NEON/MatrixProduct.h +++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h @@ -321,12 +321,14 @@ struct LhsLoopStruct Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) { constexpr auto lhsProgress = SHAPES[IDX][SHAPES_LHS_DIMENSION]; - + constexpr auto rhsProgress = SHAPES[IDX][SHAPES_RHS_DIMENSION]; DepthLoopStruct depthLS; for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress) { lhsPackMap.moveTo(rowIdx); rhsPackMap.moveTo(colIdx); + //prefetch(lhsPackMap.pCur + 2*lhsProgress); + //prefetch(rhsPackMap.pCur + 2*rhsProgress); depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); } lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); @@ -403,7 +405,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, co rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); asm __volatile__("#END_GEBP\n\t"); } - +/* template struct gemm_pack_rhs { @@ -459,7 +461,7 @@ void gemm_pack_lhs pack; pack(blockA, lhs, depth, rows, stride, offset); } - +*/ template struct gebp_kernel { diff --git a/Eigen/src/Core/arch/NEON/PackingOps.h b/Eigen/src/Core/arch/NEON/PackingOps.h index 40dbea568..1c1bcf92d 100644 --- a/Eigen/src/Core/arch/NEON/PackingOps.h +++ b/Eigen/src/Core/arch/NEON/PackingOps.h @@ -14,6 +14,54 @@ namespace Eigen { namespace internal { +template +struct PackMap<0, CPU, Index, Scalar, DataMapper, isLhs> +{ + const Scalar *pBase; + const Scalar *pCur; + Index stride; + Index offset; + Index d2Size; + + Index shift; + Index jump; + + PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset) + { + shift = (d2Size / 4) * 4; + jump = shift; + } + + EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; } + + EIGEN_STRONG_INLINE void moveTo(Index p1) + { + Index offset; + if(isLhs) + { + if(p1 >= shift) + { + offset = static_cast(shift*d2Size + ((p1%4))*d2Size); + jump = 1; + } else { + offset = p1; + jump = shift; + } + } else { + offset = static_cast(4*d2Size*(p1/4)); + pCur = pBase + offset; + } + pCur = pBase + offset; + } + + EIGEN_STRONG_INLINE void advance(int progress) + { + Index offset = static_cast(isLhs ? jump : progress); + pCur += offset; + } +}; + +/* template constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3; @@ -180,7 +228,7 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane return c; } }; - +*/ } // end namespace internal