From 6c58f0fe1ffc0ca49f8785f74c855626af907c80 Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Mon, 13 Mar 2023 23:36:06 +0000
Subject: [PATCH] Revert changes that made BF16 GEMM cause bad register
 spillage for LLVM (Power)

---
 .../arch/AltiVec/MatrixProductMMAbfloat16.h | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index 4774587f5..b30c4f8e3 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -146,8 +146,8 @@ EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f
 
   zeroAccumulators(quad_acc);
 
-  Index k = 0;
-  for(Index j = depth >> 1; j--; k += 2){
+  Index k;
+  for(k = 0; k + 2 <= depth; k += 2){
     KLoop(indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
   }
   if(depth&1){
@@ -185,9 +185,7 @@ void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f
 template
 EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
 {
-  if (MAX_BFLOAT16_ACC > num_acc) {
-    colLoopBody(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-  }
+  colLoopBody(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
 }
 
 template
@@ -415,7 +413,7 @@ EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Inde
 template
 EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst, Index resInc)
 {
-  for(Index j = (rows - i) / size; j--; i += size, dst += size*resInc){
+  for(; i + size <= rows; i += size, dst += size*resInc){
     PacketBlock r32;
     r32.packet[0] = convertF32toBF16(result + i + 0);
     if (size >= 16) {
@@ -569,12 +567,11 @@ void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMa
   zeroAccumulators(quad_acc);
 
   LhsMapper lhs2 = lhs.getSubMapper(row, 0);
-  Index j = 0;
-  for(Index k = cend >> 1; k--; j += 2) {
+  for(Index j = 0; j + 2 <= cend; j += 2) {
     vecColLoop(j, lhs2, rhs, quad_acc);
   }
   if (cend & 1) {
-    vecColLoop(j, lhs2, rhs, quad_acc);
+    vecColLoop(cend - 1, lhs2, rhs, quad_acc);
   }
 
   disassembleAccumulators(quad_acc, acc);
@@ -588,9 +585,7 @@ void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMa
 template
 EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
 {
-  if (MAX_BFLOAT16_VEC_ACC > num_acc) {
-    colVecColLoopBody(row, cend, rows, lhs, rhs, pAlpha, result);
-  }
+  colVecColLoopBody(row, cend, rows, lhs, rhs, pAlpha, result);
 }
 
 template
@@ -769,7 +764,7 @@
 template
 EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc], Index extra_cols)
 {
   Index j = 0;
-  for(Index k = cols >> 3; k--; j += 8) {
+  for(; j + 8 <= cols; j += 8){
     multVecLoop(quad_acc, lhs, rhs, j, extra_cols);
   }
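
Why the restored loop shape matters: the count-down loops removed above keep two induction variables live across the body (a trip counter j plus the real index k), while the bounds-check form the patch restores needs only k. In these MMA kernels, where accumulators and packed operands already occupy most registers, the extra live value is a plausible source of the spills the commit message describes. The standalone sketch below is illustrative only; processPair and the function names are hypothetical stand-ins, not from Eigen. It shows that the two shapes visit the same index sequence, so the revert changes code generation, not results:

#include <cstdint>

// Hypothetical stand-in for the real per-iteration work
// (KLoop / vecColLoop in the patch).
static void processPair(int64_t /*k*/) {}

// Count-down shape (removed by this patch): the trip counter j and the
// index k are both live induction variables inside the loop.
static void countDownLoop(int64_t depth) {
  int64_t k = 0;
  for (int64_t j = depth >> 1; j--; k += 2) {
    processPair(k);
  }
  if (depth & 1) {
    processPair(k);  // k already points at the odd leftover element
  }
}

// Bounds-check shape (restored by this patch): k is the only induction
// variable, so one fewer value stays live across the loop body.
static void boundsCheckLoop(int64_t depth) {
  int64_t k;
  for (k = 0; k + 2 <= depth; k += 2) {
    processPair(k);
  }
  if (depth & 1) {
    processPair(k);
  }
}

int main() {
  // Both visit k = 0, 2, 4 and then the odd tail element for depth = 5.
  countDownLoop(5);
  boundsCheckLoop(5);
  return 0;
}

The removed MAX_BFLOAT16_ACC / MAX_BFLOAT16_VEC_ACC guards are part of the same revert: colLoopBodyExtraN and colVecColLoopBodyExtraN go back to calling colLoopBody and colVecColLoopBody unconditionally.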