From 9b8cdceea85d6bcaba287959b0ee0f522de2be54 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Thu, 13 May 2021 14:42:22 +0000 Subject: [PATCH] WIP 2 --- Eigen/src/Core/arch/NEON/Kernels.h | 317 ++++++++++++----------- Eigen/src/Core/arch/NEON/MatrixProduct.h | 37 ++- compile.sh | 6 +- new_gemm_test.cpp | 21 +- run.sh | 63 ++--- 5 files changed, 248 insertions(+), 196 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index bca70f593..ef5591ea1 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -92,6 +92,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1> _acc.packet[2] = pset1(0); } + EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) { @@ -127,6 +129,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> _acc.packet[1] = pset1(0); } + EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) { @@ -157,6 +161,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> { _acc = pset1(0); } + + EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) @@ -186,6 +192,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> _acc = pset1(0); } + EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) { @@ -216,6 +224,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> _acc.packet[3] = pset1(0); } + EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) + { + dest.getLinearMapper(row, col + 0).prefetch(0); + dest.getLinearMapper(row, col + 1).prefetch(0); + dest.getLinearMapper(row, col + 2).prefetch(0); + dest.getLinearMapper(row, col + 3).prefetch(0); + } + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) { @@ -227,15 +243,17 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) { + constexpr auto PacketSize = unpacket_traits::size; + LinearMapper r0 = dest.getLinearMapper(row, col + 0); LinearMapper r1 = dest.getLinearMapper(row, col + 1); LinearMapper r2 = dest.getLinearMapper(row, col + 2); LinearMapper r3 = dest.getLinearMapper(row, col + 3); - r0.storePacket(0, r0.template loadPacket(0) + _acc.packet[0]); - r1.storePacket(0, r1.template loadPacket(0) + _acc.packet[1]); - r2.storePacket(0, r2.template loadPacket(0) + _acc.packet[2]); - r3.storePacket(0, r3.template loadPacket(0) + _acc.packet[3]); + r0.storePacket(0*PacketSize, r0.template loadPacket(0*PacketSize) + _acc.packet[0]); + r1.storePacket(0*PacketSize, r1.template loadPacket(0*PacketSize) + _acc.packet[1]); + r2.storePacket(0*PacketSize, r2.template loadPacket(0*PacketSize) + _acc.packet[2]); + r3.storePacket(0*PacketSize, r3.template loadPacket(0*PacketSize) + _acc.packet[3]); } }; @@ -262,6 +280,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> _acc2.packet[3] = pset1(0); } + EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) + { + dest.getLinearMapper(row + 0, col + 0).prefetch(0); + dest.getLinearMapper(row + 0, col + 1).prefetch(0); + dest.getLinearMapper(row + 0, col + 2).prefetch(0); + dest.getLinearMapper(row + 0, col + 3).prefetch(0); + } + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) { @@ -278,26 +304,22 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) { - LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); - LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); - LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); - LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); + constexpr auto PacketSize = unpacket_traits::size; - LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); - LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); - LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); - LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); + LinearMapper r0 = dest.getLinearMapper(row, col + 0); + LinearMapper r1 = dest.getLinearMapper(row, col + 1); + LinearMapper r2 = dest.getLinearMapper(row, col + 2); + LinearMapper r3 = dest.getLinearMapper(row, col + 3); + r0.storePacket(0*PacketSize, r0.template loadPacket(0*PacketSize) + _acc1.packet[0]); + r1.storePacket(0*PacketSize, r1.template loadPacket(0*PacketSize) + _acc1.packet[1]); + r2.storePacket(0*PacketSize, r2.template loadPacket(0*PacketSize) + _acc1.packet[2]); + r3.storePacket(0*PacketSize, r3.template loadPacket(0*PacketSize) + _acc1.packet[3]); - r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); - r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); - r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); - r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); - - r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); - r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); - r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); - r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); + r0.storePacket(1*PacketSize, r0.template loadPacket(1*PacketSize) + _acc2.packet[0]); + r1.storePacket(1*PacketSize, r1.template loadPacket(1*PacketSize) + _acc2.packet[1]); + r2.storePacket(1*PacketSize, r2.template loadPacket(1*PacketSize) + _acc2.packet[2]); + r3.storePacket(1*PacketSize, r3.template loadPacket(1*PacketSize) + _acc2.packet[3]); } }; @@ -330,6 +352,14 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4> _acc3.packet[3] = pset1(0); } + EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col) + { + dest.getLinearMapper(row + 0, col + 0).prefetch(0); + dest.getLinearMapper(row + 0, col + 1).prefetch(0); + dest.getLinearMapper(row + 0, col + 2).prefetch(0); + dest.getLinearMapper(row + 0, col + 3).prefetch(0); + } + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) { @@ -351,94 +381,139 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4> EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) { - LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); - LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1); - LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); - LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); + constexpr auto PacketSize = unpacket_traits::size; - LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0); - LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); - LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); - LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); + LinearMapper r0 = dest.getLinearMapper(row, col + 0); + LinearMapper r1 = dest.getLinearMapper(row, col + 1); + LinearMapper r2 = dest.getLinearMapper(row, col + 2); + LinearMapper r3 = dest.getLinearMapper(row, col + 3); - LinearMapper r20 = dest.getLinearMapper(row + 12, col + 0); - LinearMapper r21 = dest.getLinearMapper(row + 12, col + 1); - LinearMapper r22 = dest.getLinearMapper(row + 12, col + 2); - LinearMapper r23 = dest.getLinearMapper(row + 12, col + 3); + r0.storePacket(0*PacketSize, r0.template loadPacket(0*PacketSize) + _acc1.packet[0]); + r1.storePacket(0*PacketSize, r1.template loadPacket(0*PacketSize) + _acc1.packet[1]); + r2.storePacket(0*PacketSize, r2.template loadPacket(0*PacketSize) + _acc1.packet[2]); + r3.storePacket(0*PacketSize, r3.template loadPacket(0*PacketSize) + _acc1.packet[3]); + r0.storePacket(1*PacketSize, r0.template loadPacket(1*PacketSize) + _acc2.packet[0]); + r1.storePacket(1*PacketSize, r1.template loadPacket(1*PacketSize) + _acc2.packet[1]); + r2.storePacket(1*PacketSize, r2.template loadPacket(1*PacketSize) + _acc2.packet[2]); + r3.storePacket(1*PacketSize, r3.template loadPacket(1*PacketSize) + _acc2.packet[3]); - r00.storePacket(0, r00.template loadPacket(0) + _acc1.packet[0]); - r01.storePacket(0, r01.template loadPacket(0) + _acc1.packet[1]); - r02.storePacket(0, r02.template loadPacket(0) + _acc1.packet[2]); - r03.storePacket(0, r03.template loadPacket(0) + _acc1.packet[3]); - - r10.storePacket(0, r10.template loadPacket(0) + _acc2.packet[0]); - r11.storePacket(0, r11.template loadPacket(0) + _acc2.packet[1]); - r12.storePacket(0, r12.template loadPacket(0) + _acc2.packet[2]); - r13.storePacket(0, r13.template loadPacket(0) + _acc2.packet[3]); - - r20.storePacket(0, r20.template loadPacket(0) + _acc3.packet[0]); - r21.storePacket(0, r21.template loadPacket(0) + _acc3.packet[1]); - r22.storePacket(0, r22.template loadPacket(0) + _acc3.packet[2]); - r23.storePacket(0, r23.template loadPacket(0) + _acc3.packet[3]); + r0.storePacket(2*PacketSize, r0.template loadPacket(2*PacketSize) + _acc3.packet[0]); + r1.storePacket(2*PacketSize, r1.template loadPacket(2*PacketSize) + _acc3.packet[1]); + r2.storePacket(2*PacketSize, r2.template loadPacket(2*PacketSize) + _acc3.packet[2]); + r3.storePacket(2*PacketSize, r3.template loadPacket(2*PacketSize) + _acc3.packet[3]); } }; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); -// LhsPacket pLhs, pLhs2; -// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + LhsPacket pLhs; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); +#if __UNROLL__ > 4 + MICRO_4x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); + MICRO_4x1x4(); +#endif -// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t"); -// }; -// }; + asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); + }; +}; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t"); -// LhsPacket pLhs, pLhs2; -// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + LhsPacket pLhs, pLhs2; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; -// prefetch(lhsPackMap.pCur + 2*32); -// prefetch(rhsPackMap.pCur + 2*16); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); -// MICRO_8x1x4(); +#if __UNROLL__ == 8 +#ifdef __ENABLE_PREFETCH__ + prefetch(rhsPackMap.pCur + (48+0)); +#endif + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); +#ifdef __ENABLE_PREFETCH__ + prefetch(rhsPackMap.pCur + (48+16)); +#endif + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); +#else + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); + MICRO_8x1x4(); +#endif + asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t"); + }; +}; -// asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t"); -// }; -// }; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; + + asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t"); + + LhsPacket pLhs, pLhs2, pLhs3; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + +#if __UNROLL__ == 8 +#ifdef __ENABLE_PREFETCH__ + prefetch(rhsPackMap.pCur); +#endif + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); +#else + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); + MICRO_12x1x4(); +#endif + asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t"); + }; +}; template struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4> @@ -451,14 +526,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, using LhsPacket = typename packet_traits::type; using RhsPacket = typename packet_traits::type; - asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t"); LhsPacket pLhs, pLhs2, pLhs3; RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; MICRO_12x1x4(); - asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); + asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t"); }; }; @@ -484,58 +559,6 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, }; }; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; - -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); -// LhsPacket pLhs; -// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); - -// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); -// }; -// }; - -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; - -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t"); -// LhsPacket pLhs; -// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; - -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); -// MICRO_4x1x4(); - -// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t"); -// }; -// }; - template struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4> { diff --git a/Eigen/src/Core/arch/NEON/MatrixProduct.h b/Eigen/src/Core/arch/NEON/MatrixProduct.h index a901ccca6..3e509f29e 100644 --- a/Eigen/src/Core/arch/NEON/MatrixProduct.h +++ b/Eigen/src/Core/arch/NEON/MatrixProduct.h @@ -18,8 +18,12 @@ namespace Eigen { namespace internal { +#ifndef __UNROLL__ +#define __UNROLL__ 8 +#endif + template -constexpr int SHAPES_COUNT = 8; +constexpr int SHAPES_COUNT = 11; constexpr int SHAPES_DIMENSION = 6; constexpr int SHAPES_LHS_DIMENSION = 0; @@ -43,14 +47,17 @@ constexpr int PACK_SHAPES_END = -1; // lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map template constexpr int SHAPES[SHAPES_COUNT][SHAPES_DIMENSION] = - { /* 0 */{ 1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /* 1 */{1*packet_traits::size,1,1, 0, 0, SHAPES_POINTER_END}, - /* 2 */{2*packet_traits::size,1,1, 0, 1, SHAPES_POINTER_END}, - /* 3 */{3*packet_traits::size,1,1, 0, 2, SHAPES_POINTER_END}, - /* 4 */{ 1,1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END}, - /* 5 */{1*packet_traits::size,1,4, 3, 4, SHAPES_POINTER_END}, - /* 6 */{2*packet_traits::size,1,4, 3, 5, SHAPES_POINTER_END}, - /* 7 */{3*packet_traits::size,1,4, 3, 6, SHAPES_POINTER_END}}; + { /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, + /* 01 */{1*packet_traits::size, 1,1, 0, 0, SHAPES_POINTER_END}, + /* 02 */{2*packet_traits::size, 1,1, 0, 1, SHAPES_POINTER_END}, + /* 03 */{3*packet_traits::size, 1,1, 0, 2, SHAPES_POINTER_END}, + /* 04 */{ 1, 1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END}, + /* 05 */{1*packet_traits::size, 1,4, 3, 4, SHAPES_POINTER_END}, + /* 06 */{1*packet_traits::size,__UNROLL__,4, 3, 4, SHAPES_POINTER_END}, + /* 07 */{2*packet_traits::size, 1,4, 3, 6, SHAPES_POINTER_END}, + /* 08 */{2*packet_traits::size,__UNROLL__,4, 3, 6, 7}, + /* 09 */{3*packet_traits::size, 1,4, 3, 8, SHAPES_POINTER_END}, + /* 10 */{3*packet_traits::size,__UNROLL__,4, 3, 8, 9}}; // d1progress x d2progress template @@ -225,6 +232,8 @@ struct Accumulator } } + EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {} + template EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha) { @@ -305,11 +314,21 @@ struct DepthLoopStruct constexpr auto lhsProgress = SHAPES[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION]; constexpr auto depthProgress = SHAPES[IDX][SHAPES_DEP_DIMENSION]; +#ifdef __ENABLE_PREFETCH__ + prefetch(lhsPackMap.pCur); + prefetch(rhsPackMap.pCur); +#endif + typedef Accumulator AccumulatorType; MicroKernel mkt; AccumulatorType acc; acc.zero(); + +#ifdef __ENABLE_PREFETCH__ + acc.prefetch(res, rowIdx, colIdx); +#endif + for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress) { mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc); diff --git a/compile.sh b/compile.sh index 90b4ae11c..3490ec5e5 100755 --- a/compile.sh +++ b/compile.sh @@ -1,5 +1,5 @@ #!/bin/bash -echo 'Compiling with master' -g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto +#echo 'Compiling with master' +#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto echo 'Compiling current' -g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt \ No newline at end of file +g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gt \ No newline at end of file diff --git a/new_gemm_test.cpp b/new_gemm_test.cpp index d471931ef..f6fbbca48 100644 --- a/new_gemm_test.cpp +++ b/new_gemm_test.cpp @@ -15,10 +15,11 @@ void set(MatrixXf& A, int m, int n, int id, int digits) int main(int argc, char* argv[]) { #ifdef __DEBUG__ - int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n); + int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n); MatrixXf A = MatrixXf::Zero(m, k); MatrixXf B = MatrixXf::Zero(k, n); MatrixXf C = MatrixXf::Zero(m, n); + MatrixXf D = MatrixXf::Zero(m, n); set(A, m, k, 1, static_cast(std::log10(max)) + 1); set(B, k, n, 2, static_cast(std::log10(max)) + 1); @@ -40,17 +41,25 @@ int main(int argc, char* argv[]) { acc += A(i,kk)*B(kk,j); } - C(i,j) = acc; - //std::cout << acc << " "; + D(i,j) = acc; + if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5) + { + std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl; + } } - //std::cout << std::endl; } std::cout << C << std::endl; #else - int sz = 128; + if(argc < 3) + { + std::cout << "Wrong number of arguments." << std::endl; + return -1; + } + + int sz = std::atoi(argv[1]); int m = sz, k = sz, n = sz; - int RUNS = 500; + int RUNS = std::atoi(argv[2]); double time = 0; for(auto i = 0; i < RUNS; i++) diff --git a/run.sh b/run.sh index 1f9ba96f0..eed6ae34f 100755 --- a/run.sh +++ b/run.sh @@ -1,32 +1,33 @@ #!/bin/bash -echo 'Running with master' -T_OLD1=$(./gto) -echo $T_OLD1 -echo 'Running current' -T_NEW1=$(./gt) -echo $T_NEW1 -echo 'Running with master' -T_OLD2=$(./gto) -echo $T_OLD2 -echo 'Running with master' -T_OLD3=$(./gto) -echo $T_OLD3 -echo 'Running current' -T_NEW2=$(./gt) -echo $T_NEW2 -echo 'Running with master' -T_OLD4=$(./gto) -echo $T_OLD4 -echo 'Running current' -T_NEW3=$(./gt) -echo $T_NEW3 -echo 'Running current' -T_NEW4=$(./gt) -echo $T_NEW4 -echo 'Running with master' -T_OLD5=$(./gto) -echo $T_OLD5 -echo 'Running current' -T_NEW5=$(./gt) -echo $T_NEW5 -echo "($T_OLD1 + $T_OLD2 + $T_OLD3 + $T_OLD4 + $T_OLD5) / ($T_NEW1 + $T_NEW2 + $T_NEW3 + $T_NEW4 + $T_NEW5)" | bc -l +function run() { + OLD=0 + NEW=0 + EXECS=$1 + SIZE=$2 + RUNS=$3 + for ((i = 0; i < $EXECS; i++)) do + SEL=$(A=$(shuf -i 0-10 -n 1); echo $(($A % 2))) + if [ $SEL -eq 0 ]; then + T_OLD=$(./gto $SIZE $RUNS) + #echo "Master: $T_OLD" + OLD=$OLD+$T_OLD + T_NEW=$(./gt $SIZE $RUNS) + #echo "Current: $T_NEW" + else + T_NEW=$(./gt $SIZE $RUNS) + #echo "Current: $T_NEW" + T_OLD=$(./gto $SIZE $RUNS) + #echo "Master: $T_OLD" + OLD=$OLD+$T_OLD + fi + NEW=$NEW+$T_NEW + done + SPEED=$(echo "($OLD) / ($NEW)" | bc -l) + echo "$SIZE -> $SPEED" +} + +run $1 16 500 +run $1 32 500 +run $1 64 500 +run $1 128 100 +run $1 256 100