From 43ce8e9d2d6ef49bd2feba1b975c758dfc119472 Mon Sep 17 00:00:00 2001
From: Everton Constantino
Date: Thu, 22 Apr 2021 17:43:22 +0000
Subject: [PATCH] WIP

Work-in-progress NEON additions. All new specializations use 0 as their
first template argument.

Kernels.h
  * Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> keeps four
    accumulator packets in _acc. zero() fills them with pset1(0), scale()
    multiplies each one by pAlpha, and store() adds packet k to the
    destination packet at (row, col + k), k = 0..3, and writes the sum back.
  * MicroKernel<0, ..., 4, 1, 4> loads one packet from each pack map's pCur,
    splats RHS lanes 0..3 with pset1, accumulates pLhs * splat(k) into
    accumulator packet k and then advances both pack maps by 4. The
    rowIdx/colIdx/depthIdx arguments are not read.

MatrixProduct.h
  * The second PACK_SHAPES definition is commented out.

PackingOps.h
  * PACK_SHAPES_COUNT<0, ...> is 3 and PACK_SHAPES<0, ...> gains a {4,4,0}
    entry next to {1,1,PACK_SHAPES_END} and {4,1,0}.
  * PackingOperator<0, ..., 4, 4> swaps the two indices for the RHS, then
    either copies four packets loaded at (d1Idx, d2Idx + k) straight into
    the block (LHS/ColMajor or RHS/RowMajor) or transposes them with
    ptranspose first. It returns the block pointer advanced by
    4 * vectorSize.
  * PackingOperator<0, ..., 4, 1> copies one packet when the layout allows
    it (LHS/ColMajor at (d1Idx, d2Idx), RHS/RowMajor at (d2Idx, d1Idx)) and
    otherwise copies four scalars one by one. It returns the block pointer
    advanced by 4.

---
Review notes (ignored by git am):
 - The 4x4 packer now swaps its indices through an Index temporary; the
   previous int temporary narrowed the value.
 - The mixed &&/|| layout test in the 4x4 packer is parenthesised
   explicitly and the stray ';' after MicroKernel::operator() is gone.
   Neither changes behaviour.
 - NOTE(review): template parameter lists and explicit template arguments
   (after "template", packet_traits, pset1, pload, loadPacket, PacketBlock)
   are absent in this copy of the patch, presumably lost in transit.
   Restore them from the original commit before applying.
 - NOTE(review): the 4x1 packer advances by a literal 4 while the 4x4 packer
   uses packet_traits::size. This presumably assumes 4-lane packets; confirm.
 - NOTE(review): the asm markers are labelled 3x1 although the
   specialization is <..., 4, 1, 4>. Confirm the intended label.
 - NOTE(review): Accumulator::scale() never reads its alpha argument.

 Eigen/src/Core/arch/NEON/Kernels.h       | 70 +++++++++++++++++
 Eigen/src/Core/arch/NEON/MatrixProduct.h |  4 +-
 Eigen/src/Core/arch/NEON/PackingOps.h    | 98 ++++++++++++++++++++++++
 3 files changed, 170 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h
index 2f16841b9..0eb78f1b1 100644
--- a/Eigen/src/Core/arch/NEON/Kernels.h
+++ b/Eigen/src/Core/arch/NEON/Kernels.h
@@ -14,6 +14,76 @@ namespace Eigen {
 
 namespace internal {
 
+template
+struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
+{
+  using LinearMapper = typename DataMapper::LinearMapper;
+  using AccPacket = typename packet_traits::type;
+  using ResPacket = typename packet_traits::type;
+
+  PacketBlock _acc;
+
+  EIGEN_STRONG_INLINE void zero()
+  {
+    _acc.packet[0] = pset1(0);
+    _acc.packet[1] = pset1(0);
+    _acc.packet[2] = pset1(0);
+    _acc.packet[3] = pset1(0);
+  }
+
+  template
+  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
+  {
+    _acc.packet[0] *= pAlpha;
+    _acc.packet[1] *= pAlpha;
+    _acc.packet[2] *= pAlpha;
+    _acc.packet[3] *= pAlpha;
+  }
+
+  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
+  {
+    LinearMapper r0 = dest.getLinearMapper(row, col + 0);
+    LinearMapper r1 = dest.getLinearMapper(row, col + 1);
+    LinearMapper r2 = dest.getLinearMapper(row, col + 2);
+    LinearMapper r3 = dest.getLinearMapper(row, col + 3);
+
+    r0.storePacket(0, r0.template loadPacket(0) + _acc.packet[0]);
+    r1.storePacket(0, r1.template loadPacket(0) + _acc.packet[1]);
+    r2.storePacket(0, r2.template loadPacket(0) + _acc.packet[2]);
+    r3.storePacket(0, r3.template loadPacket(0) + _acc.packet[3]);
+  }
+};
+
+template
+struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
+{
+  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
+                                      RhsPackMap& rhsPackMap,
+                                      Index rowIdx, Index colIdx, Index depthIdx,
+                                      Accumulator& acc)
+  {
+    using LhsPacket = typename packet_traits::type;
+    using RhsPacket = typename packet_traits::type;
+
+    asm __volatile__("#BEGIN_NEON_MICROKERNEL_3x1\n\t");
+    LhsPacket pLhs = pload(lhsPackMap.pCur);
+    RhsPacket pRhs = pload(rhsPackMap.pCur);
+    RhsPacket pRhs0 = pset1(pRhs[0]);
+    RhsPacket pRhs1 = pset1(pRhs[1]);
+    RhsPacket pRhs2 = pset1(pRhs[2]);
+    RhsPacket pRhs3 = pset1(pRhs[3]);
+
+    acc._acc.packet[0] += pLhs*pRhs0;
+    acc._acc.packet[1] += pLhs*pRhs1;
+    acc._acc.packet[2] += pLhs*pRhs2;
+    acc._acc.packet[3] += pLhs*pRhs3;
+
+    lhsPackMap.advance(4*1);
+    rhsPackMap.advance(1*4);
+    asm __volatile__("#END_NEON_MICROKERNEL_3x1\n\t");
+  }
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h
index 88968b4f0..85c2d05f3 100644
--- a/Eigen/src/Core/arch/NEON/MatrixProduct.h
+++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h
@@ -584,8 +584,8 @@ constexpr int SHAPES[SHAPES_COUNT][SHAPE
 template
 constexpr int PACK_SHAPES[PACK_SHAPES_COUNT][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0}};
 
-template
-constexpr int PACK_SHAPES[PACK_SHAPES_COUNT][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0}};
+//template
+//constexpr int PACK_SHAPES[PACK_SHAPES_COUNT][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0}};
 
 template
 struct PackingOperator
diff --git a/Eigen/src/Core/arch/NEON/PackingOps.h b/Eigen/src/Core/arch/NEON/PackingOps.h
index 72edb800f..bc1629a1a 100644
--- a/Eigen/src/Core/arch/NEON/PackingOps.h
+++ b/Eigen/src/Core/arch/NEON/PackingOps.h
@@ -14,6 +14,104 @@ namespace Eigen {
 
 namespace internal {
 
+template
+constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
+
+template
+constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}};
+
+template
+struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4>
+{
+  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
+  {
+    using Packet = typename packet_traits::type;
+    constexpr int vectorSize = packet_traits::size;
+
+    Scalar *c = block;
+
+    if(!isLhs)
+    {
+      Index tD = d1Idx; // Index, not int: the swap must not narrow the offset
+      d1Idx = d2Idx;
+      d2Idx = tD;
+    }
+
+    if((isLhs && StorageOrder == ColMajor) || (!isLhs && StorageOrder == RowMajor))
+    {
+      Packet p0 = data.template loadPacket(d1Idx, d2Idx + 0);
+      Packet p1 = data.template loadPacket(d1Idx, d2Idx + 1);
+      Packet p2 = data.template loadPacket(d1Idx, d2Idx + 2);
+      Packet p3 = data.template loadPacket(d1Idx, d2Idx + 3);
+
+      pstore(c + 0*vectorSize, p0);
+      pstore(c + 1*vectorSize, p1);
+      pstore(c + 2*vectorSize, p2);
+      pstore(c + 3*vectorSize, p3);
+      c+=4*vectorSize;
+    } else {
+      PacketBlock pblock;
+
+      pblock.packet[0] = data.template loadPacket(d1Idx, d2Idx + 0);
+      pblock.packet[1] = data.template loadPacket(d1Idx, d2Idx + 1);
+      pblock.packet[2] = data.template loadPacket(d1Idx, d2Idx + 2);
+      pblock.packet[3] = data.template loadPacket(d1Idx, d2Idx + 3);
+
+      ptranspose(pblock);
+
+      pstore(c + 0*vectorSize, pblock.packet[0]);
+      pstore(c + 1*vectorSize, pblock.packet[1]);
+      pstore(c + 2*vectorSize, pblock.packet[2]);
+      pstore(c + 3*vectorSize, pblock.packet[3]);
+      c+=4*vectorSize;
+    }
+    return c;
+  }
+};
+
+template
+struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1>
+{
+  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
+  {
+    using Packet = typename packet_traits::type;
+    Scalar *c = block;
+    if(isLhs && StorageOrder == ColMajor)
+    {
+      Packet p = data.template loadPacket(d1Idx, d2Idx);
+      pstore(c, p);
+      c+=4;
+    } else if(!isLhs && StorageOrder == RowMajor) {
+      Packet p = data.template loadPacket(d2Idx, d1Idx);
+      pstore(c, p);
+      c+=4;
+    } else {
+      if(isLhs)
+      {
+        *c = data(d1Idx + 0, d2Idx);
+        c++;
+        *c = data(d1Idx + 1, d2Idx);
+        c++;
+        *c = data(d1Idx + 2, d2Idx);
+        c++;
+        *c = data(d1Idx + 3, d2Idx);
+        c++;
+      } else {
+        *c = data(d2Idx, d1Idx + 0);
+        c++;
+        *c = data(d2Idx, d1Idx + 1);
+        c++;
+        *c = data(d2Idx, d1Idx + 2);
+        c++;
+        *c = data(d2Idx, d1Idx + 3);
+        c++;
+      }
+    }
+    return c;
+  }
+};
+
+
 } // end namespace internal
 
 } // end namespace Eigen