From a8ec6d6a36c54006696b5aaa102e71650dcf64f1 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Wed, 12 May 2021 17:09:33 +0000 Subject: [PATCH] WIP with tests --- Eigen/src/Core/arch/NEON/Kernels.h | 515 +++++++++++++++-------- Eigen/src/Core/arch/NEON/MatrixProduct.h | 29 +- Eigen/src/Core/arch/NEON/PackingOps.h | 51 +-- compile.sh | 5 + new_gemm_test.cpp | 92 ++++ run.sh | 32 ++ 6 files changed, 506 insertions(+), 218 deletions(-) create mode 100755 compile.sh create mode 100644 new_gemm_test.cpp create mode 100755 run.sh diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index c7f56594a..bca70f593 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -14,52 +14,135 @@ namespace Eigen { namespace internal { -// template -// constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9; +#ifdef __ENABLE_VECTOR_KERNELS__ -// template -// constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] = -// { /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, -// /*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END}, -// /*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END}, -// /*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END}, -// /*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END}, -// /*5*/ {4,4,4, 2, 2, 3}, -// /*6*/ {4,8,4, 2, 2, 4}, -// /*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END}, -// /*8*/ {8,4,4, 2, 4, 6}}; +#define MICRO_12x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + lhsPackMap.advance(4*1); \ + pLhs2 = pload(lhsPackMap.pCur); \ + lhsPackMap.advance(4*1); \ + pLhs3 = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs2 = pset1(pRhs[2]); \ + pRhs3 = pset1(pRhs[3]); \ + acc._acc1.packet[0] += pLhs*pRhs0; \ + acc._acc1.packet[1] += pLhs*pRhs1; \ + acc._acc1.packet[2] += pLhs*pRhs2; \ + acc._acc1.packet[3] += pLhs*pRhs3; \ + acc._acc2.packet[0] += pLhs2*pRhs0; \ + 
acc._acc2.packet[1] += pLhs2*pRhs1; \ + acc._acc2.packet[2] += pLhs2*pRhs2; \ + acc._acc2.packet[3] += pLhs2*pRhs3; \ + acc._acc3.packet[0] += pLhs3*pRhs0; \ + acc._acc3.packet[1] += pLhs3*pRhs1; \ + acc._acc3.packet[2] += pLhs3*pRhs2; \ + acc._acc3.packet[3] += pLhs3*pRhs3; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1*4); -// template -// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> -// { -// using LinearMapper = typename DataMapper::LinearMapper; -// using AccPacket = typename packet_traits::type; -// using ResPacket = typename packet_traits::type; +#define MICRO_8x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + lhsPackMap.advance(4*1); \ + pLhs2 = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs2 = pset1(pRhs[2]); \ + pRhs3 = pset1(pRhs[3]); \ + acc._acc1.packet[0] += pLhs*pRhs0; \ + acc._acc1.packet[1] += pLhs*pRhs1; \ + acc._acc1.packet[2] += pLhs*pRhs2; \ + acc._acc1.packet[3] += pLhs*pRhs3; \ + acc._acc2.packet[0] += pLhs2*pRhs0; \ + acc._acc2.packet[1] += pLhs2*pRhs1; \ + acc._acc2.packet[2] += pLhs2*pRhs2; \ + acc._acc2.packet[3] += pLhs2*pRhs3; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1*4); -// PacketBlock _acc; +#define MICRO_4x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs2 = pset1(pRhs[2]); \ + pRhs3 = pset1(pRhs[3]); \ + acc._acc.packet[0] += pLhs*pRhs0; \ + acc._acc.packet[1] += pLhs*pRhs1; \ + acc._acc.packet[2] += pLhs*pRhs2; \ + acc._acc.packet[3] += pLhs*pRhs3; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1*4); -// EIGEN_STRONG_INLINE void zero() -// { -// _acc.packet[0] = pset1(0); -// _acc.packet[1] = pset1(0); -// } +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename 
packet_traits::type; -// template -// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) -// { -// _acc.packet[0] *= pAlpha; -// _acc.packet[1] *= pAlpha; -// } + PacketBlock _acc; -// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) -// { -// PacketBlock block; -// block.packet[0] = dest.template loadPacket(row, col) + _acc.packet[0]; -// dest.template storePacketBlock(row, col, block); -// block.packet[0] = dest.template loadPacket(row + 4, col) + _acc.packet[1]; -// dest.template storePacketBlock(row + 4, col, block); -// } -// }; + EIGEN_STRONG_INLINE void zero() + { + _acc.packet[0] = pset1(0); + _acc.packet[1] = pset1(0); + _acc.packet[2] = pset1(0); + } + + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc.packet[0] *= pAlpha; + _acc.packet[1] *= pAlpha; + _acc.packet[2] *= pAlpha; + } + + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + PacketBlock block; + block.packet[0] = dest.template loadPacket(row + 0, col) + _acc.packet[0]; + dest.template storePacketBlock(row + 0, col, block); + block.packet[0] = dest.template loadPacket(row + 4, col) + _acc.packet[1]; + dest.template storePacketBlock(row + 4, col, block); + block.packet[0] = dest.template loadPacket(row + 8, col) + _acc.packet[2]; + dest.template storePacketBlock(row + 8, col, block); + } +}; + +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; + + PacketBlock _acc; + + EIGEN_STRONG_INLINE void zero() + { + _acc.packet[0] = pset1(0); + _acc.packet[1] = pset1(0); + } + + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc.packet[0] *= pAlpha; + _acc.packet[1] *= pAlpha; + } + + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, 
Index col) + { + PacketBlock block; + block.packet[0] = dest.template loadPacket(row, col) + _acc.packet[0]; + dest.template storePacketBlock(row, col, block); + block.packet[0] = dest.template loadPacket(row + 4, col) + _acc.packet[1]; + dest.template storePacketBlock(row + 4, col, block); + } +}; template struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> @@ -156,87 +239,150 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> } }; -// template -// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> -// { -// using LinearMapper = typename DataMapper::LinearMapper; -// using AccPacket = typename packet_traits::type; -// using ResPacket = typename packet_traits::type; +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; -// PacketBlock _acc1; -// PacketBlock _acc2; + PacketBlock _acc1; + PacketBlock _acc2; -// EIGEN_STRONG_INLINE void zero() -// { -// _acc1.packet[0] = pset1(0); -// _acc1.packet[1] = pset1(0); -// _acc1.packet[2] = pset1(0); -// _acc1.packet[3] = pset1(0); + EIGEN_STRONG_INLINE void zero() + { + _acc1.packet[0] = pset1(0); + _acc1.packet[1] = pset1(0); + _acc1.packet[2] = pset1(0); + _acc1.packet[3] = pset1(0); -// _acc2.packet[0] = pset1(0); -// _acc2.packet[1] = pset1(0); -// _acc2.packet[2] = pset1(0); -// _acc2.packet[3] = pset1(0); -// } + _acc2.packet[0] = pset1(0); + _acc2.packet[1] = pset1(0); + _acc2.packet[2] = pset1(0); + _acc2.packet[3] = pset1(0); + } -// template -// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) -// { -// _acc1.packet[0] *= pAlpha; -// _acc1.packet[1] *= pAlpha; -// _acc1.packet[2] *= pAlpha; -// _acc1.packet[3] *= pAlpha; + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc1.packet[0] *= pAlpha; + _acc1.packet[1] *= pAlpha; + 
_acc1.packet[2] *= pAlpha; + _acc1.packet[3] *= pAlpha; -// _acc2.packet[0] *= pAlpha; -// _acc2.packet[1] *= pAlpha; -// _acc2.packet[2] *= pAlpha; -// _acc2.packet[3] *= pAlpha; -// } + _acc2.packet[0] *= pAlpha; + _acc2.packet[1] *= pAlpha; + _acc2.packet[2] *= pAlpha; + _acc2.packet[3] *= pAlpha; + } -// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) -// { -// LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); -// LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); -// LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); -// LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); + LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); + LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); + LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); -// LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); -// LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); -// LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); -// LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); + LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); + LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); + LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); + LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); -// r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); -// r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); -// r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); -// r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); + r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); + r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); + r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); + r03.storePacket(0, r03.template loadPacket(0) 
+ _acc1.packet[3]); -// r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); -// r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); -// r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); -// r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); -// } -// }; + r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); + r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); + r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); + r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); + } +}; -// #define MICRO_8x1x4() \ -// pLhs = pload(lhsPackMap.pCur); \ -// lhsPackMap.advance(4*1); \ -// pLhs2 = pload(lhsPackMap.pCur); \ -// pRhs = pload(rhsPackMap.pCur); \ -// pRhs0 = pset1(pRhs[0]); \ -// pRhs1 = pset1(pRhs[1]); \ -// pRhs2 = pset1(pRhs[2]); \ -// pRhs3 = pset1(pRhs[3]); \ -// acc._acc1.packet[0] += pLhs*pRhs0; \ -// acc._acc1.packet[1] += pLhs*pRhs1; \ -// acc._acc1.packet[2] += pLhs*pRhs2; \ -// acc._acc1.packet[3] += pLhs*pRhs3; \ -// acc._acc2.packet[0] += pLhs2*pRhs0; \ -// acc._acc2.packet[1] += pLhs2*pRhs1; \ -// acc._acc2.packet[2] += pLhs2*pRhs2; \ -// acc._acc2.packet[3] += pLhs2*pRhs3; \ -// lhsPackMap.advance(4*1); \ -// rhsPackMap.advance(1*4); +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; + + PacketBlock _acc1; + PacketBlock _acc2; + PacketBlock _acc3; + + EIGEN_STRONG_INLINE void zero() + { + _acc1.packet[0] = pset1(0); + _acc1.packet[1] = pset1(0); + _acc1.packet[2] = pset1(0); + _acc1.packet[3] = pset1(0); + + _acc2.packet[0] = pset1(0); + _acc2.packet[1] = pset1(0); + _acc2.packet[2] = pset1(0); + _acc2.packet[3] = pset1(0); + + _acc3.packet[0] = pset1(0); + _acc3.packet[1] = pset1(0); + _acc3.packet[2] = pset1(0); + _acc3.packet[3] = pset1(0); + } + + template 
+ EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc1.packet[0] *= pAlpha; + _acc1.packet[1] *= pAlpha; + _acc1.packet[2] *= pAlpha; + _acc1.packet[3] *= pAlpha; + + _acc2.packet[0] *= pAlpha; + _acc2.packet[1] *= pAlpha; + _acc2.packet[2] *= pAlpha; + _acc2.packet[3] *= pAlpha; + + _acc3.packet[0] *= pAlpha; + _acc3.packet[1] *= pAlpha; + _acc3.packet[2] *= pAlpha; + _acc3.packet[3] *= pAlpha; + } + + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); + LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); + LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); + LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); + + LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); + LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); + LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); + LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); + + LinearMapper r20 = dest.getLinearMapper(row + 8, col + 0); /* _acc3 sits one 4-row step after _acc2 (row + 4), matching the 12x1 accumulator's row + 8; row + 12 would fall outside the 12-row tile */ + LinearMapper r21 = dest.getLinearMapper(row + 8, col + 1); + LinearMapper r22 = dest.getLinearMapper(row + 8, col + 2); + LinearMapper r23 = dest.getLinearMapper(row + 8, col + 3); + + + r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); + r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); + r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); + r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); + + r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); + r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); + r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); + r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); + + r20.storePacket(0, r20.template loadPacket(0) + _acc3.packet[0]); + r21.storePacket(0, r21.template loadPacket(0) + _acc3.packet[1]); + r22.storePacket(0, r22.template loadPacket(0) + 
_acc3.packet[2]); + r23.storePacket(0, r23.template loadPacket(0) + _acc3.packet[3]); + } +}; // template // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4> @@ -294,41 +440,49 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> // }; // }; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); -// LhsPacket pLhs, pLhs2; -// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + LhsPacket pLhs, pLhs2, pLhs3; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; -// MICRO_8x1x4(); + MICRO_12x1x4(); -// asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); -// }; -// }; + asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); + }; +}; -#define MICRO_4x1x4() \ - pLhs = pload(lhsPackMap.pCur); \ - pRhs = pload(rhsPackMap.pCur); \ - pRhs0 = pset1(pRhs[0]); \ - pRhs1 = pset1(pRhs[1]); \ - pRhs2 = pset1(pRhs[2]); \ - pRhs3 = pset1(pRhs[3]); \ - acc._acc.packet[0] += pLhs*pRhs0; \ - acc._acc.packet[1] += pLhs*pRhs1; \ - acc._acc.packet[2] += pLhs*pRhs2; \ - acc._acc.packet[3] += pLhs*pRhs3; \ - lhsPackMap.advance(4*1); \ - 
rhsPackMap.advance(1*4); +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); + + LhsPacket pLhs, pLhs2; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + + MICRO_8x1x4(); + + asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); + }; +}; // template // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4> @@ -377,6 +531,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> // MICRO_4x1x4(); // MICRO_4x1x4(); // MICRO_4x1x4(); + // asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); // }; // }; @@ -403,32 +558,62 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, }; }; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + asm 
__volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); -// LhsPacket pLhs = pload(lhsPackMap.pCur); -// RhsPacket pRhs = pset1(*rhsPackMap.pCur); + LhsPacket pLhs = pload(lhsPackMap.pCur); + RhsPacket pRhs = pset1(*rhsPackMap.pCur); -// acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); -// lhsPackMap.advance(4*1); -// pLhs = pload(lhsPackMap.pCur); -// acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); + acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); + lhsPackMap.advance(4*1); + pLhs = pload(lhsPackMap.pCur); + acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); + lhsPackMap.advance(4*1); + pLhs = pload(lhsPackMap.pCur); + acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); -// lhsPackMap.advance(4*1); -// rhsPackMap.advance(1); -// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); -// }; -// }; + lhsPackMap.advance(4*1); + rhsPackMap.advance(1); + asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); + }; +}; + +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + + LhsPacket pLhs = pload(lhsPackMap.pCur); + RhsPacket pRhs = pset1(*rhsPackMap.pCur); + + acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); + lhsPackMap.advance(4*1); + pLhs = pload(lhsPackMap.pCur); + acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); + + lhsPackMap.advance(4*1); + rhsPackMap.advance(1); + asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); + }; +}; template struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1> @@ -446,8 +631,7 @@ struct MicroKernel<0, CPU, 
Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, LhsPacket pLhs = pload(lhsPackMap.pCur); RhsPacket pRhs = pset1(*rhsPackMap.pCur); - //acc._acc += pRhs*pLhs; - acc._acc = pmadd(pRhs, pLhs, acc._acc); + acc._acc += pRhs*pLhs; lhsPackMap.advance(4*1); rhsPackMap.advance(1); @@ -478,6 +662,9 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t"); }; }; + +#endif // __ENABLE_VECTOR_KERNELS__ + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h index 8871871f1..a901ccca6 100644 --- a/Eigen/src/Core/arch/NEON/MatrixProduct.h +++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h @@ -19,7 +19,7 @@ namespace Eigen { namespace internal { template -constexpr int SHAPES_COUNT = 4; +constexpr int SHAPES_COUNT = 8; constexpr int SHAPES_DIMENSION = 6; constexpr int SHAPES_LHS_DIMENSION = 0; @@ -32,6 +32,10 @@ constexpr int SHAPES_POINTER_END = -1; template constexpr int PACK_SHAPES_COUNT = 2; + +template +constexpr int PACK_SHAPES_COUNT = 4; + constexpr int PACK_SHAPES_DIMENSION = 3; constexpr int PACK_SHAPES_POINTER = 2; constexpr int PACK_SHAPES_END = -1; @@ -39,14 +43,27 @@ constexpr int PACK_SHAPES_END = -1; // lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map template constexpr int SHAPES[SHAPES_COUNT][SHAPES_DIMENSION] = - { {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, - {4,1,1, 0, 0, SHAPES_POINTER_END}, - {1,1,4, 1, SHAPES_POINTER_END, SHAPES_POINTER_END}, - {4,1,4, 1, 2, SHAPES_POINTER_END}}; + { /* 0 */{ 1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, + /* 1 */{1*packet_traits::size,1,1, 0, 0, SHAPES_POINTER_END}, + /* 2 */{2*packet_traits::size,1,1, 0, 1, SHAPES_POINTER_END}, + /* 3 */{3*packet_traits::size,1,1, 0, 2, SHAPES_POINTER_END}, + /* 4 */{ 1,1,4, 3, SHAPES_POINTER_END, 
SHAPES_POINTER_END}, + /* 5 */{1*packet_traits::size,1,4, 3, 4, SHAPES_POINTER_END}, + /* 6 */{2*packet_traits::size,1,4, 3, 5, SHAPES_POINTER_END}, + /* 7 */{3*packet_traits::size,1,4, 3, 6, SHAPES_POINTER_END}}; // d1progress x d2progress template -constexpr int PACK_SHAPES[PACK_SHAPES_COUNT][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0}}; +constexpr int PACK_SHAPES[PACK_SHAPES_COUNT][PACK_SHAPES_DIMENSION] = +{{ 1, 1, PACK_SHAPES_END}, + { 4, 1, 0}}; + +template +constexpr int PACK_SHAPES[PACK_SHAPES_COUNT][PACK_SHAPES_DIMENSION] = +{{ 1, 1, PACK_SHAPES_END}, + {1*packet_traits::size, 1, 0}, + {2*packet_traits::size, 1, 1}, + {3*packet_traits::size, 1, 2}}; template struct PackingOperator diff --git a/Eigen/src/Core/arch/NEON/PackingOps.h b/Eigen/src/Core/arch/NEON/PackingOps.h index 1c1bcf92d..9c16a2f38 100644 --- a/Eigen/src/Core/arch/NEON/PackingOps.h +++ b/Eigen/src/Core/arch/NEON/PackingOps.h @@ -14,54 +14,8 @@ namespace Eigen { namespace internal { -template -struct PackMap<0, CPU, Index, Scalar, DataMapper, isLhs> -{ - const Scalar *pBase; - const Scalar *pCur; - Index stride; - Index offset; - Index d2Size; +#ifdef __ENABLE_CUSTOM_PACKING__ - Index shift; - Index jump; - - PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset) - { - shift = (d2Size / 4) * 4; - jump = shift; - } - - EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; } - - EIGEN_STRONG_INLINE void moveTo(Index p1) - { - Index offset; - if(isLhs) - { - if(p1 >= shift) - { - offset = static_cast(shift*d2Size + ((p1%4))*d2Size); - jump = 1; - } else { - offset = p1; - jump = shift; - } - } else { - offset = static_cast(4*d2Size*(p1/4)); - pCur = pBase + offset; - } - pCur = pBase + offset; - } - - EIGEN_STRONG_INLINE void advance(int progress) - { - Index offset = static_cast(isLhs ? 
jump : progress); - pCur += offset; - } -}; - -/* template constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3; @@ -228,7 +182,8 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane return c; } }; -*/ + +#endif // __ENABLE_CUSTOM_PACKING__ } // end namespace internal diff --git a/compile.sh b/compile.sh new file mode 100755 index 000000000..90b4ae11c --- /dev/null +++ b/compile.sh @@ -0,0 +1,5 @@ +#!/bin/bash +echo 'Compiling with master' +g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto +echo 'Compiling current' +g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt \ No newline at end of file diff --git a/new_gemm_test.cpp b/new_gemm_test.cpp new file mode 100644 index 000000000..d471931ef --- /dev/null +++ b/new_gemm_test.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include + +using namespace Eigen; + +void set(MatrixXf& A, int m, int n, int id, int digits) +{ + for(auto i = 0; i < m; i++) + for(auto j = 0; j < n; j++) + A(i,j) = id*std::pow(10,(2*digits)) + i*std::pow(10,digits) + j; +} + +int main(int argc, char* argv[]) +{ +#ifdef __DEBUG__ + int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n); + MatrixXf A = MatrixXf::Zero(m, k); + MatrixXf B = MatrixXf::Zero(k, n); + MatrixXf C = MatrixXf::Zero(m, n); + + set(A, m, k, 1, static_cast(std::log10(max)) + 1); + set(B, k, n, 2, static_cast(std::log10(max)) + 1); + + C = A*B; + + std::cout << A << std::endl; + std::cout << B << std::endl; + std::cout << C << std::endl; + + std::cout << std::endl; + + for(auto i = 0; i < m; i++) + { + for(auto j = 0; j < n; j++) + { + float acc=0; + for(auto kk = 0; kk < k; kk++) + { + acc += A(i,kk)*B(kk,j); + } + C(i,j) = acc; + //std::cout << acc << " "; + } + //std::cout << std::endl; + } + + std::cout << C << std::endl; +#else + int sz = 128; + int m = sz, k = sz, n = sz; + int RUNS = 500; + double time = 0; + + for(auto i = 0; i < RUNS; i++) + { + MatrixXf A = MatrixXf::Random(m,k); + 
MatrixXf B = MatrixXf::Random(k,n); + //set(A,m, k, 1); + //set(B,k, n, 2); + MatrixXf C = MatrixXf::Zero(m, n); + + std::clock_t start,end; + start = std::clock(); + C = A*B; + end = std::clock(); + + time += 1000.0*(end-start) / CLOCKS_PER_SEC; + } + std::cout << time << std::endl; +#ifdef TEST_SCALAR + start = std::clock(); + for(auto i = 0; i < m; i++) + { + for(auto j = 0; j < n; j++) + { + float acc=0; + for(auto kk = 0; kk < k; kk++) + { + acc += A(i,kk)*B(kk,j); + } + C(i,j) = acc; + } + } + end = std::clock(); + + std::cout << 1000.0*(end-start) / CLOCKS_PER_SEC << std::endl; +#endif +#endif + return 0; +} \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100755 index 000000000..1f9ba96f0 --- /dev/null +++ b/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash +echo 'Running with master' +T_OLD1=$(./gto) +echo $T_OLD1 +echo 'Running current' +T_NEW1=$(./gt) +echo $T_NEW1 +echo 'Running with master' +T_OLD2=$(./gto) +echo $T_OLD2 +echo 'Running with master' +T_OLD3=$(./gto) +echo $T_OLD3 +echo 'Running current' +T_NEW2=$(./gt) +echo $T_NEW2 +echo 'Running with master' +T_OLD4=$(./gto) +echo $T_OLD4 +echo 'Running current' +T_NEW3=$(./gt) +echo $T_NEW3 +echo 'Running current' +T_NEW4=$(./gt) +echo $T_NEW4 +echo 'Running with master' +T_OLD5=$(./gto) +echo $T_OLD5 +echo 'Running current' +T_NEW5=$(./gt) +echo $T_NEW5 +echo "($T_OLD1 + $T_OLD2 + $T_OLD3 + $T_OLD4 + $T_OLD5) / ($T_NEW1 + $T_NEW2 + $T_NEW3 + $T_NEW4 + $T_NEW5)" | bc -l