diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 4cc0a94ff..3b3b55890 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -1159,6 +1159,29 @@ EIGEN_ALWAYS_INLINE void bstore(PacketBlock& acc, const DataMapper& re
   }
 }
 
+#ifdef USE_PARTIAL_PACKETS
+template
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements)
+{
+  for (Index M = 0; M < N; M++) {
+    acc.packet[M] = res.template loadPacketPartial(row, M, elements);
+  }
+  if (Complex && full) {
+    for (Index M = 0; M < N; M++) {
+      acc.packet[M+N] = res.template loadPacketPartial(row + accCols, M, elements);
+    }
+  }
+}
+
+template
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements)
+{
+  for (Index M = 0; M < N; M++) {
+    res.template storePacketPartial(row, M, acc.packet[M], elements);
+  }
+}
+#endif
+
 #ifdef _ARCH_PWR10
 #define USE_P10_AND_PVIPR2_0 (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
 #else
@@ -1199,6 +1222,14 @@ EIGEN_ALWAYS_INLINE Packet2d bmask(const Index remaining_rows)
 #endif
 }
 
+template
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
+{
+  for (int M = 0; M < N; M++) {
+    acc.packet[M] = pmadd(pAlpha, accZ.packet[M], acc.packet[M]);
+  }
+}
+
 // Scale the PacketBlock vectors by alpha.
 template
 EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask)
@@ -1209,9 +1240,7 @@ EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock
   if (mask) {
     band(accZ, pMask);
   }
-  for (int M = 0; M < N; M++) {
-    acc.packet[M] = pmadd(pAlpha, accZ.packet[M], acc.packet[M]);
-  }
+  bscale(acc, accZ, pAlpha);
 }
 
 template
@@ -1461,6 +1490,13 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
     MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
   }
 
+#ifdef USE_PARTIAL_PACKETS
+  EIGEN_UNUSED_VARIABLE(rows);
+  EIGEN_UNUSED_VARIABLE(pMask);
+  bload_partial(acc, res, row, remaining_rows);
+  bscale(acc, accZero0, pAlpha);
+  bstore_partial(acc, res, row, remaining_rows);
+#else
   bload(acc, res, row, 0);
   if ((accRows == 1) || (rows >= accCols))
   {
@@ -1474,6 +1510,7 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
       }
     }
   }
+#endif
 }
 
 #define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
@@ -1565,16 +1602,35 @@ EIGEN_ALWAYS_INLINE void gemm_extra_row(
 
 #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
 
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    if (MICRO_NORMAL_PARTIAL(iter)) { \
+      bload(acc, res, row + iter*accCols, 0); \
+      bscale(acc, accZero##iter, pAlpha); \
+      bstore(acc, res, row + iter*accCols); \
+    } else { \
+      bload_partial(acc, res, row + iter*accCols, accCols2); \
+      bscale(acc, accZero##iter, pAlpha); \
+      bstore_partial(acc, res, row + iter*accCols, accCols2); \
+    } \
+  }
+#else
 #define MICRO_STORE_ONE(iter) \
   if (unroll_factor > iter) { \
     bload(acc, res, row + iter*accCols, 0); \
     bscale(acc, accZero##iter, pAlpha, pMask); \
     bstore(acc, res, row + iter*accCols); \
   }
+#endif
 
 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
 
+#ifdef USE_PARTIAL_PACKETS
+template
+#else
 template
+#endif
 EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -1585,7 +1641,12 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
   Index strideB,
   Index& row,
   const Packet& pAlpha,
-  const Packet& pMask)
+#ifdef USE_PARTIAL_PACKETS
+  Index accCols2
+#else
+  const Packet& pMask
+#endif
+  )
 {
   const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
   const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
@@ -1612,9 +1673,15 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
   MICRO_UPDATE
 }
 
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_UNROLL_ITER2(N, M) \
+  gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
+  if (M) return;
+#else
 #define MICRO_UNROLL_ITER2(N, M) \
   gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
   if (M) return;
+#endif
 
 template
 EIGEN_ALWAYS_INLINE void gemm_cols(
@@ -2094,22 +2161,22 @@ EIGEN_ALWAYS_INLINE void gemm_complex_cols(
   switch( (rows-row)/accCols ) {
 #if MAX_COMPLEX_UNROLL > 4
     case 4:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
       break;
 #endif
 #if MAX_COMPLEX_UNROLL > 3
     case 3:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 3)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 3)
       break;
 #endif
 #if MAX_COMPLEX_UNROLL > 2
     case 2:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 2)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 2)
       break;
 #endif
 #if MAX_COMPLEX_UNROLL > 1
     case 1:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 1)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 1)
       break;
 #endif
     default:
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index 70b95da3c..28868ca5a 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -5,6 +5,10 @@
   #define EIGEN_POWER_PREFETCH(p)
 #endif
 
+#ifdef _ARCH_PWR9
+#define USE_PARTIAL_PACKETS
+#endif
+
 #include "../../InternalHeaderCheck.h"
 
 namespace Eigen {
@@ -89,6 +93,17 @@ EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const D
 template
 EIGEN_ALWAYS_INLINE void bstore(PacketBlock& acc, const DataMapper& res, Index row);
 
+#ifdef USE_PARTIAL_PACKETS
+template
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements);
+
+template
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements);
+#endif
+
+template
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha);
+
 template
 EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask);
 
@@ -101,7 +116,7 @@ EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+#ifdef USE_PARTIAL_PACKETS
+template
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
+#else
 template
 EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
+#endif
 {
   PacketBlock result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
 
   PacketBlock tRes;
+#ifdef USE_PARTIAL_PACKETS
+  if (full) {
+    EIGEN_UNUSED_VARIABLE(elements);
+    bload(tRes, data, i, 0);
+    bscale(tRes, result, alpha);
+    bstore(tRes, data, i);
+  } else {
+    bload_partial(tRes, data, i, elements);
+    bscale(tRes, result, alpha);
+    bstore_partial(tRes, data, i, elements);
+  }
+#else
   bload(tRes, data, i, 0);
-  bscale(tRes, result, alpha, pMask);
-  bstore(tRes, data, i);
+#endif
 }
 
 template
@@ -270,14 +286,25 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
 
 #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
 
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    storeAccumulator(row + iter*accCols, res, pAlpha, accCols2, &accZero##iter); \
+  }
+#else
 #define MICRO_MMA_STORE_ONE(iter) \
   if (unroll_factor > iter) { \
     storeAccumulator(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
   }
+#endif
 
 #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
 
+#ifdef USE_PARTIAL_PACKETS
+template
+#else
 template
+#endif
 EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -287,7 +314,12 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
   Index offsetA,
   Index& row,
   const Packet& pAlpha,
-  const Packet& pMask)
+#ifdef USE_PARTIAL_PACKETS
+  Index accCols2
+#else
+  const Packet& pMask
+#endif
+  )
 {
   const Scalar* rhs_ptr = rhs_base;
   const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
@@ -312,9 +344,15 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
   MICRO_UPDATE
 }
 
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_MMA_UNROLL_ITER2(N, M) \
+  gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
+  if (M) return;
+#else
 #define MICRO_MMA_UNROLL_ITER2(N, M) \
   gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
   if (M) return;
+#endif
 
 template
 EIGEN_ALWAYS_INLINE void gemmMMA_cols(
@@ -643,22 +681,22 @@ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
   switch( (rows-row)/accCols ) {
 #if MAX_COMPLEX_MMA_UNROLL > 4
     case 4:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
       break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 3
     case 3:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
       break;
 #endif
#if MAX_COMPLEX_MMA_UNROLL > 2
     case 2:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
       break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 1
     case 1:
-      MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
       break;
 #endif
     default:
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 4dd53f6a4..fceff293f 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -513,6 +513,7 @@ EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* f
   eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
   EIGEN_DEBUG_ALIGNED_LOAD
   EIGEN_UNUSED_VARIABLE(from);
   Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
@@ -645,6 +646,7 @@ template EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPAC
   eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
   EIGEN_UNUSED_VARIABLE(to);
   EIGEN_DEBUG_ALIGNED_STORE
   Packet store = from;
@@ -1215,6 +1217,7 @@ template EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const
   eigen_assert(n <= packet_size && "number of elements will read past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
   EIGEN_DEBUG_ALIGNED_LOAD
   EIGEN_DEBUG_UNALIGNED_LOAD
   return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
@@ -1402,6 +1405,7 @@ template EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPA
   eigen_assert(n <= packet_size && "number of elements will write past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
   EIGEN_DEBUG_UNALIGNED_STORE
   vec_xst_len(from, to, n * size);
 #else
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 56473a9ac..3f0007783 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -292,7 +292,7 @@ public:
   }
 
   template
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/ = 0) const {
     return pgather_partial(m_data + i*m_incr.value(), m_incr.value(), n);
   }
 
@@ -302,7 +302,7 @@ public:
   }
 
   template
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/ = 0) const {
     pscatter_partial(m_data + i*m_incr.value(), p, m_incr.value(), n);
   }
 
@@ -343,7 +343,7 @@ public:
   }
 
   template
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/ = 0) const {
     return pgather_partial(&operator()(i, j),m_incr.value(),n);
   }
 
@@ -358,7 +358,7 @@ public:
   }
 
   template
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/ = 0) const {
     pscatter_partial(&operator()(i, j), p, m_incr.value(), n);
   }
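
Note (illustrative sketch, not part of the patch): USE_PARTIAL_PACKETS is keyed off _ARCH_PWR9 because POWER9 provides the length-controlled vector load/store intrinsics vec_xl_len and vec_xst_len, which let a GEMM remainder of fewer than accCols rows be read and written directly instead of being blended through a pMask. The standalone sketch below shows the shape of the bload_partial / bscale / bstore_partial sequence on a raw float buffer when built for POWER (e.g. -mcpu=power9); load_partial, store_partial and scale_and_accumulate_partial are hypothetical helper names that only mirror the patch's helpers and are not Eigen API.

#include <altivec.h>
#include <cstddef>

typedef __vector float Packet4f;

// Load only n floats (n <= 4) without reading past the end of the buffer.
// On POWER9, vec_xl_len takes the length in bytes; elsewhere fall back to a
// scalar loop (roughly what the pre-PWR9 path in PacketMath.h does).
static inline Packet4f load_partial(const float* from, std::size_t n) {
#ifdef _ARCH_PWR9
  return vec_xl_len(const_cast<float*>(from), n * sizeof(float));
#else
  Packet4f v = { 0.0f, 0.0f, 0.0f, 0.0f };
  for (std::size_t i = 0; i < n; i++) v[i] = from[i];  // GCC/Clang vector subscript
  return v;
#endif
}

// Store only n floats (n <= 4) without writing past the end of the buffer.
static inline void store_partial(float* to, Packet4f v, std::size_t n) {
#ifdef _ARCH_PWR9
  vec_xst_len(v, to, n * sizeof(float));
#else
  for (std::size_t i = 0; i < n; i++) to[i] = v[i];
#endif
}

// res[0..n) += alpha * acc[0..n): the same load / multiply-add / store shape
// as the bload_partial + bscale + bstore_partial sequence used by the
// remainder path of the patch, but on a plain float pointer.
static inline void scale_and_accumulate_partial(float* res, Packet4f acc, float alpha, std::size_t n) {
  const Packet4f pAlpha = vec_splats(alpha);
  Packet4f r = load_partial(res, n);
  r = vec_madd(pAlpha, acc, r);  // r = alpha * acc + r
  store_partial(res, r, n);
}

Because the load/store length is a runtime byte count, one code path covers any remainder from 1 to packet_size - 1 elements, which is why the partial-packet variants in the patch can drop the pMask argument when USE_PARTIAL_PACKETS is defined.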