diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 1c8551c04..2919dda8b 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -193,7 +193,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* bloc
   const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
   const Index vectorSize = quad_traits<Scalar>::vectorsize;
   const Index vectorDelta = vectorSize * depth;
-  Scalar* blockAf = (Scalar *)(blockA);
+  Scalar* blockAf = reinterpret_cast<Scalar *>(blockA);
   Index rir = 0, rii, j = 0;

   for(; j + vectorSize <= rows; j+=vectorSize)
@@ -1269,8 +1269,8 @@ const static Packet4i mask43 = { -1, -1, -1, 0 };

 const static Packet2l mask21 = { -1, 0 };

-template
-EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
+template
+EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
 {
   if (remaining_rows == 0) {
     return pset1<Packet>(float(0.0)); // Not used
@@ -1284,7 +1284,7 @@ EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
 }

 template<>
-EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows)
 {
   if (remaining_rows == 0) {
     return pset1<Packet2d>(double(0.0)); // Not used
@@ -1748,7 +1748,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const
   if( strideB == -1 ) strideB = depth;

   const Packet pAlpha = pset1<Packet>(alpha);
-  const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+  const Packet pMask = bmask<Packet>(remaining_rows);

   Index col = 0;
   for(; col + accRows <= cols; col += accRows)
@@ -2208,7 +2208,7 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl

   const Packet pAlphaReal = pset1<Packet>(alpha.real());
   const Packet pAlphaImag = pset1<Packet>(alpha.imag());
-  const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+  const Packet pMask = bmask<Packet>(remaining_rows);

   const Scalar* blockA = (Scalar *) blockAc;
   const Scalar* blockB = (Scalar *) blockBc;
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index 768d9c7c4..d92b67815 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -44,8 +44,8 @@ EIGEN_STRONG_INLINE void gemm_extra_cols(
   const Packet& pAlpha,
   const Packet& pMask);

-template
-EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
+template
+EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);

 template
 EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
@@ -87,7 +87,7 @@ template
 EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);

 template
-EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col);
+EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col);

 template
 EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha);
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 9a3132276..8104697a1 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -11,7 +11,9 @@
 #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
 #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H

+#if !EIGEN_COMP_LLVM
 #pragma GCC target("cpu=power10,htm")
+#endif

 #ifdef __has_builtin
 #if !__has_builtin(__builtin_vsx_assemble_pair)
@@ -80,7 +82,7 @@ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L
 template
 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
 {
-  __vector_pair* a0 = (__vector_pair *)(&a.packet[0]);
+  __vector_pair* a0 = reinterpret_cast<__vector_pair *>(const_cast<Packet2d *>(&a.packet[0]));
   if(NegativeAccumulate)
   {
     __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
@@ -133,8 +135,8 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
 template<>
 EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
 {
-  rhsV.packet[0] = ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs ));
-  rhsV.packet[1] = ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1));
+  rhsV.packet[0] = ploadRhs<double, Packet2d>(rhs);
+  rhsV.packet[1] = ploadRhs<double, Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)));
 }

 template<>
@@ -142,8 +144,8 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, _
 {
 #if EIGEN_COMP_LLVM
   __builtin_vsx_assemble_pair(&rhsV,
-      (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1))),
-      (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs ))));
+      reinterpret_cast<__vector unsigned char>(ploadRhs<double, Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
+      reinterpret_cast<__vector unsigned char>(ploadRhs<double, Packet2d>(rhs)));
 #else
   __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
 #endif
@@ -360,7 +362,7 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
   if( strideB == -1 ) strideB = depth;

   const Packet pAlpha = pset1<Packet>(alpha);
-  const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+  const Packet pMask = bmask<Packet>(remaining_rows);

   Index col = 0;
   for(; col + accRows <= cols; col += accRows)
@@ -595,7 +597,7 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS

   const Packet pAlphaReal = pset1<Packet>(alpha.real());
   const Packet pAlphaImag = pset1<Packet>(alpha.imag());
-  const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+  const Packet pMask = bmask<Packet>(remaining_rows);

   const Scalar* blockA = (Scalar *) blockAc;
   const Scalar* blockB = (Scalar *) blockBc;
@@ -613,7 +615,9 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS
 #undef advanceRows
 #undef advanceCols

+#if !EIGEN_COMP_LLVM
 #pragma GCC reset_options
+#endif

 } // end namespace internal
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 86ce4631d..d40ae534f 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -167,7 +167,7 @@ EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScal
   if (GEMV_GETN(N) > iter1) { \
     if (GEMV_IS_FLOAT) { \
       LhsPacket h[2]; \
-      __builtin_vsx_disassemble_pair((void*)(h), &b##iter2); \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void *>(h), &b##iter2); \
       pger_vecMMA_acc(&e##iter2, a0, h[0]); \
       pger_vecMMA_acc(&e##iter3, a0, h[1]); \
     } else { \
@@ -302,6 +302,8 @@ EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScal
 #define GEMV_INIT(iter, N) \
   if (N > iter) { \
     c##iter = pset1<ResPacket>(ResScalar(0)); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(c##iter); \
   }

 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
@@ -407,9 +409,11 @@ EIGEN_STRONG_INLINE void gemv_col(
     RhsPacketSize = Traits::RhsPacketSize,
   };

+#ifndef GCC_ONE_VECTORPAIR_BUG
   const Index n8 = rows - 8 * ResPacketSize + 1;
   const Index n4 = rows - 4 * ResPacketSize + 1;
   const Index n2 = rows - 2 * ResPacketSize + 1;
+#endif
   const Index n1 = rows - 1 * ResPacketSize + 1;
 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
   const Index prefetch_dist = 64 * LhsPacketSize;
@@ -1289,10 +1293,10 @@ EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector
   gemv_mult_complex_real_MMA(a0, b, c0); \
 }

-GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float);
-GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double);
-GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float);
-GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double);
+GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float)
+GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double)
+GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float)
+GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double)

 /** \internal disassemble MMA accumulator results into packets */
 template
@@ -1439,7 +1443,7 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock
   if (GEMV_GETN(N) > iter1) { \
     if (GEMV_IS_COMPLEX_FLOAT) { \
       PLhsPacket g[2]; \
-      __builtin_vsx_disassemble_pair((void*)(g), &a##iter2); \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void *>(g), &a##iter2); \
       gemv_mult_complex_MMA(g[0], b, &e0##iter2); \
       gemv_mult_complex_MMA(g[1], b, &e0##iter3); \
     } else { \
@@ -1525,12 +1529,17 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock
   if (N > iter) { \
     c0##iter = pset_zero(); \
     c1##iter = pset_init(c1##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(c0##iter); \
+    EIGEN_UNUSED_VARIABLE(c1##iter); \
   }

 #define GEMV_WORK_COL_COMPLEX(iter, N) \
   if (N > iter) { \
     f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
     gemv_mult_complex(f##iter, b, c0##iter, c1##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(f##iter); \
   }

 #define GEMV_STORE_COL_COMPLEX(iter, N) \
@@ -1616,9 +1625,11 @@ EIGEN_STRONG_INLINE void gemv_complex_col(
   const Index prefetch_dist = 64 * LhsPacketSize;
 #endif

+#ifndef GCC_ONE_VECTORPAIR_BUG
   const Index n8 = rows - 8 * ResPacketSize + 1;
   const Index n4 = rows - 4 * ResPacketSize + 1;
   const Index n2 = rows - 2 * ResPacketSize + 1;
+#endif
   const Index n1 = rows - 1 * ResPacketSize + 1;

   // TODO: improve the following heuristic:
@@ -1661,10 +1672,10 @@ EIGEN_STRONG_INLINE void gemv_complex_col(
   {
     GEMV_PROCESS_COL_COMPLEX(2)
   }
+  if (i < n1)
 #else
   while (i < n1)
 #endif
-  if (i < n1)
   {
     GEMV_PROCESS_COL_COMPLEX_ONE(1)
   }
@@ -1861,11 +1872,15 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(ResPacket& a, ResPa
     } else { \
       cc##iter1 = predux_real(&c##iter1); \
     } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(cc##iter1); \
   }
 #else
 #define GEMV_INIT_ROW(iter, N) \
   if (N > iter) { \
     c##iter = pset1<ResPacket>(ResScalar(0)); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(c##iter); \
   }

 #define GEMV_WORK_ROW(iter, N) \
@@ -1876,6 +1891,8 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(ResPacket& a, ResPa
 #define GEMV_PREDUX2(iter1, iter2, iter3, N) \
   if (N > iter1) { \
     cc##iter1 = predux_real(c##iter2, c##iter3); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(cc##iter1); \
   }

 #endif
@@ -1933,9 +1950,11 @@ EIGEN_STRONG_INLINE void gemv_row(

   // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
   // processing 8 rows at once might be counter productive wrt cache.
+#ifndef GCC_ONE_VECTORPAIR_BUG
   const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
   const Index n4 = rows - 3;
   const Index n2 = rows - 1;
+#endif

   // TODO: for padded aligned inputs, we could enable aligned reads
   enum {
@@ -1952,8 +1971,8 @@ EIGEN_STRONG_INLINE void gemv_row(
 #else
   ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
 #endif
-  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
 #ifndef GCC_ONE_VECTORPAIR_BUG
+  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
   GEMV_PROCESS_ROW(8)
   GEMV_PROCESS_ROW(4)
   GEMV_PROCESS_ROW(2)
@@ -2061,6 +2080,8 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PRe
     } else { \
       cc##iter1 = predux_complex(&e0##iter1); \
     } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(cc##iter1); \
   }

 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
@@ -2084,6 +2105,8 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PRe
 #define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \
   if (N > iter1) { \
     cc##iter1 = predux_complex(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(cc##iter1); \
   }

 #define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \
@@ -2133,9 +2156,11 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PRe
   lhs.template load(i + (iter), j)

 #define GEMV_INIT_COMPLEX_OLD(iter, N) \
+  EIGEN_UNUSED_VARIABLE(c0##iter); \
   if (N > iter) { \
     c1##iter = pset_zero(); \
-    EIGEN_UNUSED_VARIABLE(c0##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(c1##iter); \
   }

 #define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \
@@ -2148,6 +2173,8 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PRe
   if (N > iter1) { \
     cc##iter1.scalar[0] = predux(c1##iter2); \
     cc##iter1.scalar[1] = predux(c1##iter3); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(cc##iter1); \
   }

 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
@@ -2237,9 +2264,11 @@ EIGEN_STRONG_INLINE void gemv_complex_row(

   // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
   // processing 8 rows at once might be counter productive wrt cache.
+#ifndef GCC_ONE_VECTORPAIR_BUG
   const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
   const Index n4 = rows - 3;
   const Index n2 = rows - 1;
+#endif

   // TODO: for padded aligned inputs, we could enable aligned reads
   enum {
@@ -2258,12 +2287,12 @@ EIGEN_STRONG_INLINE void gemv_complex_row(
   GEMV_UNUSED_EXTRA(1, c0)
   GEMV_UNUSED_EXTRA(1, c1)
 #endif
-  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
   ResScalar dd0;
-#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_MMA)
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+#ifdef USE_GEMV_MMA
   if (!GEMV_IS_COMPLEX_COMPLEX)
 #endif
-#ifndef GCC_ONE_VECTORPAIR_BUG
   {
     GEMV_PROCESS_ROW_COMPLEX(8)
   }
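
Note on the recurring "} else { EIGEN_UNUSED_VARIABLE(...); }" additions in the GEMV macros above: each macro is expanded for a fixed maximum unroll factor, so whenever the compile-time count N does not reach a given iter, the corresponding c##iter / c0##iter / cc##iter / f##iter variable is declared but never touched, and the new else branch gives it a dummy use, presumably to keep unused-variable warnings quiet. The sketch below only illustrates that idiom and is not Eigen code; UNROLL_INIT, MARK_UNUSED and sum2 are made-up stand-ins for GEMV_INIT and EIGEN_UNUSED_VARIABLE.

// Illustration only -- hypothetical names, same silence-the-dead-lane idiom as GEMV_INIT.
#include <cstdio>

#define MARK_UNUSED(x) ((void)(x))   /* stand-in for EIGEN_UNUSED_VARIABLE */

#define UNROLL_INIT(iter, N, acc) \
  if ((N) > (iter)) {             \
    acc##iter = 0.0f;             \
  } else {                        \
    MARK_UNUSED(acc##iter);       \
  }

static float sum2(const float* data, int n)
{
  float c0, c1, c2, c3;  // fixed maximum unroll of 4 accumulators
  UNROLL_INIT(0, 2, c)   // only lanes 0 and 1 are live in this 2-wide pass
  UNROLL_INIT(1, 2, c)
  UNROLL_INIT(2, 2, c)   // dead lanes get a (void) "use" so -Wunused-variable stays quiet
  UNROLL_INIT(3, 2, c)
  for (int i = 0; i + 1 < n; i += 2) {
    c0 += data[i];
    c1 += data[i + 1];
  }
  return c0 + c1;
}

int main()
{
  const float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
  std::printf("%g\n", sum2(v, 4));  // prints 10
  return 0;
}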
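
Similarly, the ploadRhsMMA hunk above swaps a cast-and-increment through Packet2d* for plain element arithmetic: rhs + (sizeof(Packet2d) / sizeof(double)) names the same address as (const double *)(((Packet2d *)rhs) + 1). A minimal check of that equivalence, where Packet2dStandIn is a made-up two-double struct standing in for Eigen's Packet2d vector type:

// Illustration only -- Packet2dStandIn is hypothetical, not Eigen's Packet2d.
#include <cassert>

struct Packet2dStandIn { double v[2]; };  // 16 bytes, like one VSX register

int main()
{
  double rhs[4] = { 1.0, 2.0, 3.0, 4.0 };
  const double* oldForm = (const double *)(((Packet2dStandIn *)rhs) + 1);    // cast, advance one packet
  const double* newForm = rhs + (sizeof(Packet2dStandIn) / sizeof(double));  // advance two doubles
  assert(oldForm == newForm);  // both point at the second two-double lane
  assert(*newForm == 3.0);
  return 0;
}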