diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 1cb82eed5..17c7ce129 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -138,7 +138,7 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV)
       reinterpret_cast<__vector unsigned char>(ploadRhs<double, Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
       reinterpret_cast<__vector unsigned char>(ploadRhs<double, Packet2d>(rhs)));
 #else
-  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
+  rhsV = *reinterpret_cast<__vector_pair *>(const_cast<double *>(rhs));
 #endif
 }
@@ -147,6 +147,10 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
   ploadRhsMMA(lhs, lhsV);
 }
 
+#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#define VECTOR_PAIR_LOADS_LHS
+#endif
+
 // PEEL_MMA loop factor.
 #define PEEL_MMA 7
@@ -162,6 +166,7 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
     pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
   }
 
+#ifdef VECTOR_PAIR_LOADS_LHS
 #define MICRO_MMA_WORK_TWO(iter, type, peel) \
   if (unroll_factor > iter) { \
     pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV2##iter.packet[peel & 1]); \
   }
@@ -185,6 +190,7 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
   }
 
 #define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter)
+#endif
 
 #define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
   if (PEEL_MMA > peel) { \
@@ -196,6 +202,14 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
     EIGEN_UNUSED_VARIABLE(rhsV##peel); \
   }
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
+#else
 #define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
   if (PEEL_MMA > peel2) { \
     PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
@@ -215,6 +229,7 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
     MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
     MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \
     MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7)
+#endif
 
 #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
   type rhsV0; \
@@ -224,11 +239,15 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
   MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
   rhs_ptr += (accRows * size);
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
+#else
 #define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
-  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
+  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
   rhs_ptr += (accRows * size);
 
 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
+#endif
 
 #define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
@@ -406,6 +425,7 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
     pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
   }
 
+#ifdef VECTOR_PAIR_LOADS_LHS
 #define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \
   if (unroll_factor > iter) { \
     pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV##peel, rhsVi##peel); \
   }
@@ -428,6 +448,7 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
   MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter)
 
 #define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter)
+#endif
 
 #define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
   if (PEEL_COMPLEX_MMA > peel) { \
@@ -446,6 +467,13 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
     EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
   }
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3; \
+  type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
+#else
 #define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
   if (PEEL_COMPLEX_MMA > peel2) { \
     PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
@@ -473,6 +501,7 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
     type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
     MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
     MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3)
+#endif
 
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
   type rhsV0, rhsVi0; \
@@ -483,12 +512,16 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
   rhs_ptr_real += (accRows * size); \
   if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
+#else
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
-  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
+  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
   rhs_ptr_real += (accRows * size); \
   if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
 
 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
+#endif
 
 #define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
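
Context for the guard: the fast path gated by VECTOR_PAIR_LOADS_LHS loads two adjacent LHS packets with one 32-byte __vector_pair (lxvp) load and splits the pair with __builtin_vsx_disassemble_pair, which only works reliably on Clang and GCC 11 or newer; everything else falls back to the #ifndef single-load peel macros. Below is a minimal standalone sketch of that load-and-split pattern, not Eigen code: it assumes a POWER10-capable compiler (e.g. -mcpu=power10), the helper name load_pair is hypothetical, and the order of the two halves produced by the disassemble builtin is endian-dependent.

#include <cstdio>

// Paired 32-byte load, written as the plain pointer dereference the patch
// now uses in ploadRhsMMA instead of the old "lxvp" inline asm. GCC 11+ and
// Clang turn this into a single lxvp instruction when targeting POWER10.
static inline void load_pair(const double* src, __vector_pair& dst) {
  dst = *reinterpret_cast<__vector_pair*>(const_cast<double*>(src));
}

int main() {
  double data[4] = {1.0, 2.0, 3.0, 4.0};

  __vector_pair pair;
  load_pair(data, pair);

  // Split the pair into two independent 2 x double VSX vectors, the same
  // step MICRO_MMA_LOAD1_TWO performs to obtain two LHS packets per load.
  __vector double halves[2];
  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(halves), &pair);

  printf("%g %g %g %g\n", halves[0][0], halves[0][1], halves[1][0], halves[1][1]);
  return 0;
}

On the EIGEN_COMP_LLVM branch of ploadRhsMMA the pair is instead built from two ordinary vector loads via __builtin_vsx_assemble_pair, while compilers without either capability (GCC 10 and older) never define VECTOR_PAIR_LOADS_LHS and use the plain MICRO_MMA_UNROLL_TYPE_PEEL / MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL paths added above.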