diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index e24b5d5b3..a40d4cbb0 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -129,7 +129,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, * reason why packing for complex is broken down into several different parts, also the reason why we endup having a * float32/64 and complex float32/64 version. **/ -template +template EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) { std::complex v; @@ -148,7 +148,7 @@ EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_b return v; } -template +template EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { const Index depth = k2 + rows; @@ -166,7 +166,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc { for(Index k = 0; k < vectorSize; k++) { - std::complex v = getAdjointVal(i, j + k, rhs); + std::complex v = getAdjointVal(i, j + k, rhs); blockBf[rir + k] = v.real(); blockBf[rii + k] = v.imag(); @@ -184,7 +184,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc for(Index i = k2; i < depth; i++) { - std::complex v = getAdjointVal(i, j, rhs); + std::complex v = getAdjointVal(i, j, rhs); blockBf[rir] = v.real(); blockBf[rii] = v.imag(); @@ -197,7 +197,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc } } -template +template EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { const Index depth = cols; @@ -215,7 +215,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* bloc { for(Index k = 0; k < vectorSize; k++) { - std::complex v = getAdjointVal(j+k, i, lhs); + std::complex v = getAdjointVal(j+k, i, lhs); blockAf[rir + k] = v.real(); blockAf[rii + k] = v.imag(); @@ -236,7 +236,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* bloc Index k = j; for(; k < rows; k++) { - std::complex v = getAdjointVal(k, i, lhs); + std::complex v = getAdjointVal(k, i, lhs); blockAf[rir] = v.real(); blockAf[rii] = v.imag(); @@ -248,7 +248,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* bloc } } -template +template EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { const Index depth = k2 + rows; @@ -285,7 +285,7 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs } } -template +template EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { const Index depth = cols; @@ -332,7 +332,7 @@ struct symm_pack_rhs, Index, nr, StorageOrder> { void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -341,7 +341,7 @@ struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrde { void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + 
symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -352,7 +352,7 @@ struct symm_pack_rhs, Index, nr, StorageOrder> { void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -361,7 +361,7 @@ struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrd { void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -371,7 +371,7 @@ struct symm_pack_rhs { void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -380,7 +380,7 @@ struct symm_pack_lhs { void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -390,7 +390,7 @@ struct symm_pack_rhs { void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -399,7 +399,7 @@ struct symm_pack_lhs { void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -414,7 +414,7 @@ struct symm_pack_lhs * and offset and behaves accordingly. **/ -template +template EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); @@ -429,7 +429,7 @@ EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) } // General template for lhs & rhs complex packing. -template +template struct dhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -451,9 +451,9 @@ struct dhs_cpack { PacketBlock cblock; if (UseLhs) { - bload(cblock, lhs, j, i); + bload(cblock, lhs, j, i); } else { - bload(cblock, lhs, i, j); + bload(cblock, lhs, i, j); } blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); @@ -480,8 +480,8 @@ struct dhs_cpack { ptranspose(blocki); } - storeBlock(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 4*vectorSize; rii += 4*vectorSize; @@ -579,7 +579,7 @@ struct dhs_cpack { }; // General template for lhs & rhs packing. 
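[Editor's note] The dhs_cpack code above separates each interleaved complex block into a real plane and an imaginary plane with vec_perm (see the calls using p16uc_GETREAL32 before storeBlock). Below is a minimal, self-contained sketch of that split, not the Eigen code itself: split_complex8, kGetReal32 and kGetImag32 are illustrative names defined here, and the mask values are my own byte selections for "keep the real halves" / "keep the imaginary halves" of two adjacent complex-float vectors.

#include <complex>
#include <altivec.h>

typedef __vector float         Packet4f;
typedef __vector unsigned char Packet16uc;

// Byte-selection masks: pick the real (resp. imaginary) float of each
// complex value from the 32-byte concatenation of two input vectors.
static const Packet16uc kGetReal32 = {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
static const Packet16uc kGetImag32 = {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };

// Splits 8 interleaved std::complex<float> values: the 8 real parts go to
// realOut, the 8 imaginary parts to imagOut (both 16-byte aligned).
inline void split_complex8(const std::complex<float>* src, float* realOut, float* imagOut)
{
  const float* f = reinterpret_cast<const float*>(src);
  Packet4f v0 = vec_xl( 0, f);   // r0 i0 r1 i1
  Packet4f v1 = vec_xl(16, f);   // r2 i2 r3 i3
  Packet4f v2 = vec_xl(32, f);   // r4 i4 r5 i5
  Packet4f v3 = vec_xl(48, f);   // r6 i6 r7 i7

  Packet4f r01 = vec_perm(v0, v1, kGetReal32);  // r0 r1 r2 r3
  Packet4f r23 = vec_perm(v2, v3, kGetReal32);  // r4 r5 r6 r7
  Packet4f i01 = vec_perm(v0, v1, kGetImag32);  // i0 i1 i2 i3
  Packet4f i23 = vec_perm(v2, v3, kGetImag32);  // i4 i5 i6 i7

  vec_xst(r01,  0, realOut);
  vec_xst(r23, 16, realOut);
  vec_xst(i01,  0, imagOut);
  vec_xst(i23, 16, imagOut);
}

Packing real and imaginary parts into separate contiguous blocks is what lets the GEMM micro-kernels later multiply them with purely real packet arithmetic.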
-template +template struct dhs_pack{ EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -597,16 +597,16 @@ struct dhs_pack{ PacketBlock block; if (UseLhs) { - bload(block, lhs, j, i); + bload(block, lhs, j, i); } else { - bload(block, lhs, i, j); + bload(block, lhs, i, j); } if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { ptranspose(block); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 4*vectorSize; } @@ -675,8 +675,8 @@ struct dhs_pack{ }; // General template for lhs packing, float64 specialization. -template -struct dhs_pack +template +struct dhs_pack { EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -703,7 +703,7 @@ struct dhs_pack(j, i + 1); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 2*vectorSize; } @@ -742,8 +742,8 @@ struct dhs_pack -struct dhs_pack +template +struct dhs_pack { EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { @@ -780,7 +780,7 @@ struct dhs_pack(i + 1, j + 0); //[b1 b2] block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] - storeBlock(blockB + ri, block); + storeBlock(blockB + ri, block); } ri += 4*vectorSize; @@ -827,8 +827,8 @@ struct dhs_pack -struct dhs_cpack +template +struct dhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -882,8 +882,8 @@ struct dhs_cpack(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -940,8 +940,8 @@ struct dhs_cpack -struct dhs_cpack +template +struct dhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { @@ -962,7 +962,7 @@ struct dhs_cpack cblock; PacketBlock blockr, blocki; - bload(cblock, rhs, i, j); + bload(cblock, rhs, i, j); blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); @@ -976,8 +976,8 @@ struct dhs_cpack(blockBt + rir, blockr); - storeBlock(blockBt + rii, blocki); + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -1123,7 +1123,7 @@ EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock +template EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { @@ -1147,7 +1147,7 @@ EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const D } } -template +template EIGEN_ALWAYS_INLINE void bstore(PacketBlock& acc, const DataMapper& res, Index row) { for (int M = 0; M < N; M++) { @@ -1165,7 +1165,7 @@ EIGEN_ALWAYS_INLINE void bstore(PacketBlock& acc, const DataMapper& re const static Packet4i mask4[4] = { { 0, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, -1, 0, 0 }, { -1, -1, -1, 0 } }; #endif -template +template EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows) { #if USE_P10_AND_PVIPR2_0 @@ -1180,7 +1180,7 @@ EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows) } template<> -EIGEN_ALWAYS_INLINE Packet2d bmask(const Index remaining_rows) +EIGEN_ALWAYS_INLINE Packet2d bmask(const Index 
remaining_rows) { #if USE_P10_AND_PVIPR2_0 Packet2d mask2 = Packet2d(vec_gendm(remaining_rows)); @@ -1406,7 +1406,7 @@ EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock +template EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr0, @@ -1419,7 +1419,7 @@ EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( lhs_ptr += remaining_rows; } -template +template EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( const DataMapper& res, const Scalar* lhs_base, @@ -1454,14 +1454,14 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( } for(; k < depth; k++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0); + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0); } - bload(acc, res, row, 0); + bload(acc, res, row, 0); if ((accRows == 1) || (rows >= accCols)) { bscale(acc, accZero0, pAlpha, pMask); - bstore(acc, res, row); + bstore(acc, res, row); } else { bscale(acc, accZero0, pAlpha, pMask); for(Index j = 0; j < accRows; j++) { @@ -1490,9 +1490,9 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( } #define MICRO_EXTRA_ROWS(N) \ - gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask); + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask); -template +template EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, @@ -1563,14 +1563,14 @@ EIGEN_ALWAYS_INLINE void gemm_extra_row( #define MICRO_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - bload(acc, res, row + iter*accCols, 0); \ + bload(acc, res, row + iter*accCols, 0); \ bscale(acc, accZero##iter, pAlpha, pMask); \ - bstore(acc, res, row + iter*accCols); \ + bstore(acc, res, row + iter*accCols); \ } #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) -template +template EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration( const DataMapper& res, const Scalar* lhs_base, @@ -1609,10 +1609,10 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration( } #define MICRO_UNROLL_ITER2(N, M) \ - gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \ + gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \ if (M) return; -template +template EIGEN_ALWAYS_INLINE void gemm_cols( const DataMapper& res, const Scalar* blockA, @@ -1681,14 +1681,14 @@ EIGEN_ALWAYS_INLINE void gemm_cols( if(remaining_rows > 0) { - gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask); + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask); } } #define MICRO_EXTRA_COLS(N) \ - gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); -template +template EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, const Scalar* blockA, @@ -1711,7 +1711,7 @@ EIGEN_STRONG_INLINE void gemm_extra_cols( /**************** * GEMM kernels * * **************/ -template +template EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; @@ -1725,12 +1725,12 @@ EIGEN_STRONG_INLINE 
void gemm(const DataMapper& res, const Scalar* blockA, const Index col = 0; for(; col + accRows <= cols; col += accRows) { - gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); } if (col != cols) { - gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } } @@ -1828,7 +1828,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ MICRO_COMPLEX_ADD_PEEL(1, 0) -template +template EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real0, const Scalar* &rhs_ptr_real1, const Scalar* &rhs_ptr_real2, @@ -1840,7 +1840,7 @@ EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( MICRO_COMPLEX_ADD_COLS(1) } -template +template EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( const DataMapper& res, const Scalar* lhs_base, @@ -1888,18 +1888,18 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( } for(; k < depth; k++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0); + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0); } - const bool full = (remaining_rows > accColsC); - bload(tRes, res, row, 0); + constexpr bool full = (remaining_rows > accColsC); + bload(tRes, res, row, 0); if ((accRows == 1) || (rows >= accCols)) { bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); bcouple(taccReal, taccImag, tRes, acc0, acc1); - bstore(acc0, res, row + 0); + bstore(acc0, res, row + 0); if (full) { - bstore(acc1, res, row + accColsC); + bstore(acc1, res, row + accColsC); } } else { bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); @@ -1911,7 +1911,7 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( res(row + 0, j) = pfirst(acc0.packet[j]); } } else { - bstore(acc0, res, row + 0); + bstore(acc0, res, row + 0); if (full) { for(Index j = 0; j < accRows; j++) { res(row + accColsC, j) = pfirst(acc1.packet[j]); @@ -1922,9 +1922,9 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( } #define MICRO_COMPLEX_EXTRA_ROWS(N) \ - gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask); + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask); -template +template EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, @@ -1998,19 +1998,19 @@ EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - const bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \ - bload(tRes, res, row + iter*accCols, 0); \ + constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \ + bload(tRes, res, row + iter*accCols, 0); \ bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, 
taccReal, taccImag, pMask); \ bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - bstore(acc0, res, row + iter*accCols + 0); \ + bstore(acc0, res, row + iter*accCols + 0); \ if (full) { \ - bstore(acc1, res, row + iter*accCols + accColsC); \ + bstore(acc1, res, row + iter*accCols + accColsC); \ } \ } #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) -template +template EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration( const DataMapper& res, const Scalar* lhs_base, @@ -2057,10 +2057,10 @@ EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration( } #define MICRO_COMPLEX_UNROLL_ITER2(N, M) \ - gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \ + gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \ if (M) return; -template +template EIGEN_ALWAYS_INLINE void gemm_complex_cols( const DataMapper& res, const Scalar* blockA, @@ -2115,14 +2115,14 @@ EIGEN_ALWAYS_INLINE void gemm_complex_cols( if(remaining_rows > 0) { - gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); } } #define MICRO_COMPLEX_EXTRA_COLS(N) \ - gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); -template +template EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, const Scalar* blockA, @@ -2143,7 +2143,7 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_cols( MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true) } -template +template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; @@ -2161,12 +2161,12 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl Index col = 0; for(; col + accRows <= cols; col += accRows) { - gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); } if (col != cols) { - gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } } @@ -2189,7 +2189,7 @@ template ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2203,7 +2203,7 @@ template ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2218,7 +2218,7 @@ template ::operator()(double* blockB, const 
DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2232,7 +2232,7 @@ template ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } #endif @@ -2247,7 +2247,7 @@ template ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2261,7 +2261,7 @@ template ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2275,7 +2275,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2289,7 +2289,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2304,7 +2304,7 @@ template ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2318,7 +2318,7 @@ template ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } #endif @@ -2333,7 +2333,7 @@ template, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2347,7 +2347,7 @@ template, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2361,7 +2361,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2375,7 +2375,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2389,7 +2389,7 @@ template, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2403,7 +2403,7 @@ template, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - 
dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2431,16 +2431,16 @@ void gebp_kernel; + gemm_function = &Eigen::internal::gemmMMA; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemmMMA; + gemm_function = &Eigen::internal::gemmMMA; } else{ - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; } #else - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2470,16 +2470,16 @@ void gebp_kernel, std::complex, Index, DataMapper, mr #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; } else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; } #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2508,16 +2508,16 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjugat Index, Index, Index, std::complex, Index, Index, Index, Index); #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Packet, Packetc, 
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; } else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; } #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2546,16 +2546,16 @@ void gebp_kernel, float, Index, DataMapper, mr, nr, Conjugat Index, Index, Index, std::complex, Index, Index, Index, Index); #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; } else{ - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; } #else - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2583,16 +2583,16 @@ void gebp_kernel; + gemm_function = &Eigen::internal::gemmMMA; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemmMMA; + gemm_function = &Eigen::internal::gemmMMA; } else{ - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; } #else - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2621,16 +2621,16 @@ void gebp_kernel, std::complex, Index, DataMapper, Index, Index, Index, std::complex, Index, Index, Index, 
Index); #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; } else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; } #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2659,16 +2659,16 @@ void gebp_kernel, double, Index, DataMapper, mr, nr, Conjug Index, Index, Index, std::complex, Index, Index, Index, Index); #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; } else{ - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; } #else - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + gemm_function = &Eigen::internal::gemm_complex, 
double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } @@ -2697,16 +2697,16 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjug Index, Index, Index, std::complex, Index, Index, Index, Index); #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; } else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; } #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; #endif gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index e68c595c7..70b95da3c 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -11,7 +11,7 @@ namespace Eigen { namespace internal { -template +template EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, @@ -26,7 +26,7 @@ EIGEN_ALWAYS_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask); -template +template EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, const Scalar* blockA, @@ -43,10 +43,10 @@ EIGEN_STRONG_INLINE void gemm_extra_cols( const Packet& pAlpha, const Packet& pMask); -template +template EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows); -template +template EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, @@ -62,7 +62,7 @@ EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const Packet& pAlphaImag, const Packet& pMask); -template +template EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, const Scalar* blockA, @@ -83,10 +83,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_cols( template EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs); -template +template EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); -template 
+template EIGEN_ALWAYS_INLINE void bstore(PacketBlock& acc, const DataMapper& res, Index row); template diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 17c7ce129..84ba11576 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -39,30 +39,30 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) __builtin_mma_xxsetaccz(acc); } -template +template EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc) { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); PacketBlock tRes; - bload(tRes, data, i, 0); + bload(tRes, data, i, 0); bscale(tRes, result, alpha, pMask); - bstore(tRes, data, i); + bstore(tRes, data, i); } -template +template EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag) { - const bool full = (accCols2 > accColsC); + constexpr bool full = (accCols2 > accColsC); PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); __builtin_mma_disassemble_acc(&resultImag.packet, accImag); PacketBlock tRes; - bload(tRes, data, i, 0); + bload(tRes, data, i, 0); PacketBlock taccReal, taccImag; bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask); @@ -70,9 +70,9 @@ EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data PacketBlock acc1, acc2; bcouple(taccReal, taccImag, tRes, acc1, acc2); - bstore(acc1, data, i); + bstore(acc1, data, i); if (full) { - bstore(acc2, data, i + accColsC); + bstore(acc2, data, i + accColsC); } } @@ -163,13 +163,13 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) #define MICRO_MMA_WORK_ONE(iter, type, peel) \ if (unroll_factor > iter) { \ - pgerMMA(&accZero##iter, rhsV##peel, lhsV##iter); \ + pgerMMA(&accZero##iter, rhsV[peel], lhsV##iter); \ } #ifdef VECTOR_PAIR_LOADS_LHS #define MICRO_MMA_WORK_TWO(iter, type, peel) \ if (unroll_factor > iter) { \ - pgerMMA(&accZero##iter, rhsV##peel, lhsV2##iter.packet[peel & 1]); \ + pgerMMA(&accZero##iter, rhsV[peel], lhsV2##iter.packet[peel & 1]); \ } #define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \ @@ -195,16 +195,14 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) #define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \ if (PEEL_MMA > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV##peel); \ + ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV[peel]); \ MICRO_MMA_UNROLL(funcl) \ MICRO_MMA_WORK(funcw, type, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } #ifndef VECTOR_PAIR_LOADS_LHS #define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ + type rhsV[8]; \ MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \ MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \ MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \ @@ -214,17 +212,25 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) if (PEEL_MMA > peel2) { \ PacketBlock lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \ __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, 
plhsV6, plhsV7; \ - ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV##peel1); \ - ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV##peel2); \ + if (sizeof(type) == 16) { \ + ploadRhsMMA(reinterpret_cast(rhs_ptr + (accRows * peel1)), prhsV##peel1); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&rhsV[peel1]), &prhsV##peel1); \ + } else { \ + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV[peel1]); \ + ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV[peel2]); \ + } \ MICRO_MMA_UNROLL(funcl2) \ MICRO_MMA_WORK(funcw2, type, peel1) \ MICRO_MMA_WORK(funcw2, type, peel2) \ } else { \ + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \ } #define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ + type rhsV[8]; \ + __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \ MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \ MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \ MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \ @@ -232,7 +238,7 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) #endif #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \ - type rhsV0; \ + type rhsV[1]; \ MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) #define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \ @@ -266,12 +272,12 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) #define MICRO_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeAccumulator(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \ + storeAccumulator(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \ } #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) -template +template EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, @@ -307,10 +313,10 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( } #define MICRO_MMA_UNROLL_ITER2(N, M) \ - gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \ + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \ if (M) return; -template +template EIGEN_ALWAYS_INLINE void gemmMMA_cols( const DataMapper& res, const Scalar* blockA, @@ -379,11 +385,11 @@ EIGEN_ALWAYS_INLINE void gemmMMA_cols( if(remaining_rows > 0) { - gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask); + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask); } } -template +template void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; @@ -399,12 +405,12 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index col = 0; for(; col + accRows <= cols; col += accRows) { - gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); + gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); } if (col != cols) { - gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, 
strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } } @@ -422,13 +428,13 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \ if (unroll_factor > iter) { \ - pgercMMA(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgercMMA(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV[peel], rhsVi[peel]); \ } #ifdef VECTOR_PAIR_LOADS_LHS #define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \ if (unroll_factor > iter) { \ - pgercMMA(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV##peel, rhsVi##peel); \ + pgercMMA(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV[peel], rhsVi[peel]); \ } #define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \ @@ -454,23 +460,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, if (PEEL_COMPLEX_MMA > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3; \ Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ - ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV[peel]); \ if(!RhsIsReal) { \ - ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi[peel]); \ } \ MICRO_COMPLEX_MMA_UNROLL(funcl) \ MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } #ifndef VECTOR_PAIR_LOADS_LHS #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ + type rhsV[4], rhsVi[4]; \ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3) #else @@ -480,31 +480,44 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, PacketBlock lhsVi20, lhsVi21, lhsVi22, lhsVi23; \ __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \ __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \ - ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV##peel1); \ - ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV##peel2); \ - if(!RhsIsReal) { \ - ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi##peel1); \ - ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi##peel2); \ + if (sizeof(type) == 16) { \ + ploadRhsMMA(reinterpret_cast(rhs_ptr_real + (accRows * peel1)), prhsV##peel1); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&rhsV[peel1]), &prhsV##peel1); \ + if(!RhsIsReal) { \ + ploadRhsMMA(reinterpret_cast(rhs_ptr_imag + (accRows * peel1)), prhsVi##peel1); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&rhsVi[peel1]), &prhsVi##peel1); \ + } else { \ + EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \ + } \ } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel1); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel2); \ + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV[peel1]); \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV[peel2]); \ + if(!RhsIsReal) { \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi[peel1]); \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi[peel2]); \ + } \ } \ MICRO_COMPLEX_MMA_UNROLL(funcl2) \ MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \ MICRO_COMPLEX_MMA_WORK(funcw2, 
type, peel2) \ } else { \ + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \ } #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ + type rhsV[4], rhsVi[4]; \ + __vector_pair prhsV0, prhsV2; \ + __vector_pair prhsVi0, prhsVi2; \ MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \ MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) #endif #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \ - type rhsV0, rhsVi0; \ + type rhsV[1], rhsVi[1]; \ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) #define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \ @@ -542,12 +555,12 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \ + storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \ } #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) -template +template EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, @@ -597,10 +610,10 @@ EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( } #define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \ - gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \ + gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \ if (M) return; -template +template EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( const DataMapper& res, const Scalar* blockA, @@ -655,11 +668,11 @@ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( if(remaining_rows > 0) { - gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); } } -template +template void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; @@ -679,12 +692,12 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS Index col = 0; for(; col + accRows <= cols; col += accRows) { - gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); } if (col != cols) { - gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } } diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h index 
6ab4d0bf2..9eaf4144b 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h @@ -375,7 +375,7 @@ EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, c } #endif -template +template EIGEN_STRONG_INLINE void gemv_col( Index rows, Index cols, const LhsMapper& alhs, @@ -927,7 +927,7 @@ EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, Re } } -template +template EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) { PResPacket c2 = pcplxflipconj(c0); @@ -953,7 +953,7 @@ EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, A } /** \internal load lhs packet */ -template +template EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) { if (sizeof(Scalar) == sizeof(LhsScalar)) { @@ -1337,17 +1337,17 @@ EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock)) { if (ConjugateLhs) { - result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0]))); + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; } } else { result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]); @@ -1394,7 +1394,7 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock> 1) #define GEMV_LOADPACKET_COL_COMPLEX(iter) \ - loadLhsPacket(lhs, i + ((iter) * ResPacketSize), j) + loadLhsPacket(lhs, i + ((iter) * ResPacketSize), j) #define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \ convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter)) @@ -1444,7 +1444,7 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock iter1) { \ @@ -1498,7 +1498,7 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock(&e0##iter, result0##iter); \ + disassembleResults(&e0##iter, result0##iter); #define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \ if (GEMV_GETN_COMPLEX(N) > iter) { \ @@ -1520,13 +1520,13 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock(c0##iter2, c0##iter3, alpha_data, res + i); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ } else { \ c0##iter3 = PResPacket(result0##iter2.packet[2]); \ - pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ c0##iter2 = PResPacket(result0##iter3.packet[0]); \ c0##iter3 = PResPacket(result0##iter3.packet[2]); \ - pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ } \ } @@ -1607,7 +1607,7 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock +template EIGEN_STRONG_INLINE void gemv_complex_col( Index rows, Index cols, const LhsMapper& alhs, @@ -1725,10 +1725,6 @@ static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, template EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) { - union { - ScalarBlock cs; - double cd; - } cc0; PacketBlock result0, result1; __builtin_mma_disassemble_acc(&result0.packet, acc0); __builtin_mma_disassemble_acc(&result1.packet, acc1); @@ -1737,20 +1733,17 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, _ result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]); result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3); result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), 
vec_add(result0.packet[1], result0.packet[3])); - cc0.cd = pfirst(reinterpret_cast(result0.packet[0])); - return cc0.cs; + return *reinterpret_cast *>(&result0.packet[0]); } template<> EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) { - ScalarBlock cc0; PacketBlock result0, result1; __builtin_mma_disassemble_acc(&result0.packet, acc0); __builtin_mma_disassemble_acc(&result1.packet, acc1); - cc0.scalar[0] = result0.packet[0][0] + result0.packet[1][1]; - cc0.scalar[1] = result1.packet[0][0] + result1.packet[1][1]; - return cc0; + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1])); + return *reinterpret_cast *>(&result0.packet[0]); } /** \internal add complex results together */ @@ -1766,17 +1759,17 @@ EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(Packet result0.packet[3] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[3]), reinterpret_cast(result1.packet[3]))); result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]); if (ConjugateLhs) { - result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0]))); - result0.packet[1] = convertReal(pcplxflip2(convertComplex(result0.packet[1]))); + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; } else if (ConjugateRhs) { - result0.packet[1] = convertReal(pcplxconjflip(convertComplex(result0.packet[1]))); + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; } else { - result0.packet[1] = convertReal(pcplxflipconj(convertComplex(result0.packet[1]))); + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; } result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); } else { if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex))) { - result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0]))); + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; } } cc0.scalar[0].real(result0.packet[0][0]); @@ -1807,12 +1800,10 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0 template EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0) { - ScalarBlock cc0; PacketBlock result0; __builtin_mma_disassemble_acc(&result0.packet, acc0); - cc0.scalar[0] = result0.packet[0][0] + result0.packet[1][1]; - cc0.scalar[1] = result0.packet[2][0] + result0.packet[3][1]; - return cc0; + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); } template @@ -1823,25 +1814,25 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0 __builtin_mma_disassemble_acc(&result0.packet, acc0); if (GEMV_IS_COMPLEX_COMPLEX) { if (ConjugateLhs) { - result0.packet[1] = convertReal(pconjinv(convertComplex(result0.packet[1]))); - result0.packet[3] = convertReal(pconjinv(convertComplex(result0.packet[3]))); + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; } else if (ConjugateRhs) { - result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0]))); - result0.packet[2] = convertReal(pconj2(convertComplex(result0.packet[2]))); + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; } else { - result0.packet[1] = 
convertReal(pconj2(convertComplex(result0.packet[1]))); - result0.packet[3] = convertReal(pconj2(convertComplex(result0.packet[3]))); + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; } - cc0.scalar[0].real(result0.packet[0][0] + result0.packet[1][1]); - cc0.scalar[0].imag(result0.packet[0][1] + result0.packet[1][0]); - cc0.scalar[1].real(result0.packet[2][0] + result0.packet[3][1]); - cc0.scalar[1].imag(result0.packet[2][1] + result0.packet[3][0]); + result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2)); + result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2)); } else { - cc0.scalar[0].real(result0.packet[0][0]); - cc0.scalar[0].imag(result0.packet[1][1]); - cc0.scalar[1].real(result0.packet[2][0]); - cc0.scalar[1].imag(result0.packet[3][1]); + result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1); + result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1); } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[2][0]); + cc0.scalar[1].imag(result0.packet[2][1]); return cc0; } #endif @@ -1957,7 +1948,7 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(ResPacket& a, ResPa GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \ } -template +template EIGEN_STRONG_INLINE void gemv_row( Index rows, Index cols, const LhsMapper& alhs, @@ -2040,7 +2031,7 @@ struct general_matrix_vector_product(rows, cols, lhs, rhs, res, resIncr, alpha); \ + gemv_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ } \ }; @@ -2056,7 +2047,7 @@ struct general_matrix_vector_product(rows, cols, lhs, rhs, res, resIncr, alpha); \ + gemv_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ } \ }; @@ -2076,7 +2067,7 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PRe } #define GEMV_LOADPACKET_ROW_COMPLEX(iter) \ - loadLhsPacket(lhs, i + (iter), j) + loadLhsPacket(lhs, i + (iter), j) #define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \ convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter)) @@ -2276,7 +2267,7 @@ EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PRe GEMV_PROCESS_ROW_COMPLEX_ONE(N) #endif -template +template EIGEN_STRONG_INLINE void gemv_complex_row( Index rows, Index cols, const LhsMapper& alhs, @@ -2367,7 +2358,7 @@ struct general_matrix_vector_product(rows, cols, lhs, rhs, res, resIncr, alpha); \ + gemv_complex_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ } \ }; @@ -2383,7 +2374,7 @@ struct general_matrix_vector_product(rows, cols, lhs, rhs, res, resIncr, alpha); \ + gemv_complex_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ } \ };
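[Editor's note] A pattern that recurs throughout the gebp_kernel specializations in this diff is the three-way choice between an MMA-only build, runtime dynamic dispatch via __builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"), and the plain VSX fallback. The sketch below shows only that dispatch shape under the same EIGEN_ALTIVEC_MMA_ONLY / EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH macros; KernelFn, kernel_vsx, kernel_mma and select_kernel are illustrative stand-ins, not Eigen symbols.

// Stand-ins for the real kernels (gemm / gemmMMA in the file above).
typedef void (*KernelFn)(const float* a, const float* b, float* c, int n);
static void kernel_vsx(const float*, const float*, float*, int) { /* baseline VSX path */ }
static void kernel_mma(const float*, const float*, float*, int) { /* POWER10 MMA path  */ }

KernelFn select_kernel()
{
#if defined(EIGEN_ALTIVEC_MMA_ONLY)
  // Built exclusively for POWER10: always take the MMA kernel.
  return &kernel_mma;
#elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
  // Probe the running CPU (GCC builtin): ISA 3.1 and the MMA facility
  // must both be present before the MMA kernel is selected.
  if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
    return &kernel_mma;
  return &kernel_vsx;
#else
  // Default build: stay on the VSX-only code path.
  return &kernel_vsx;
#endif
}

As in the diff, the selection is made once per top-level kernel invocation and the chosen function pointer is then called, so the CPU probe never sits inside the packing or accumulation loops.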