Add partial packet support for the real-only GEMM kernels (PowerPC). Also fix compilation warnings and errors under some conditions in the new API.

Chip Kerchner 2022-08-03 18:15:19 +00:00 committed by Rasmus Munk Larsen
parent 5a1c7807e6
commit ce60a7be83
5 changed files with 167 additions and 21 deletions
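For context: the real-only GEMM kernels now handle the leftover rows of a block (when the row count is not a multiple of the packet width) with run-time-length loads and stores instead of full-width masked accesses. A rough, self-contained sketch of that tail handling in plain C++ (illustrative only, not the Eigen kernel; the names below are made up):

#include <cstddef>

// Hypothetical scalar model: process a column of `rows` floats in 4-wide blocks,
// then finish the remainder with a run-time element count, the way the new
// partial-packet path does, instead of blending a full packet with a mask.
void axpy_tail_demo(float* y, const float* x, float alpha, std::size_t rows) {
  constexpr std::size_t kPacket = 4;              // stands in for the VSX packet width
  std::size_t i = 0;
  for (; i + kPacket <= rows; i += kPacket) {     // full packets
    for (std::size_t k = 0; k < kPacket; ++k) y[i + k] += alpha * x[i + k];
  }
  const std::size_t remaining = rows - i;         // 0..3 leftover rows
  for (std::size_t k = 0; k < remaining; ++k)     // "partial packet": touch only valid lanes
    y[i + k] += alpha * x[i + k];
}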

View File

@@ -1159,6 +1159,29 @@ EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& re
}
}
#ifdef USE_PARTIAL_PACKETS
template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements)
{
for (Index M = 0; M < N; M++) {
acc.packet[M] = res.template loadPacketPartial<Packet>(row, M, elements);
}
if (Complex && full) {
for (Index M = 0; M < N; M++) {
acc.packet[M+N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
}
}
}
template<typename DataMapper, typename Packet, Index N>
EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements)
{
for (Index M = 0; M < N; M++) {
res.template storePacketPartial<Packet>(row, M, acc.packet[M], elements);
}
}
#endif
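A rough scalar model of what bload_partial and bstore_partial above do (hypothetical names and types; the real helpers go through Eigen's DataMapper and VSX packets): each of the N packets of a column block is read or written with only `elements` valid lanes, so the remainder path never touches memory past the edge of the result block. In the complex case, bload_partial additionally loads a second set of packets at row + accCols when `full` is set.

#include <array>

constexpr int kLanes = 4;                      // stands in for the packet width
using FakePacket = std::array<float, kLanes>;

// Load only `elements` lanes of each of the N packets of a column block.
template <int N>
void bload_partial_model(FakePacket (&acc)[N], const float* res, int ldRes, int elements) {
  for (int m = 0; m < N; ++m)
    for (int i = 0; i < elements; ++i) acc[m][i] = res[m * ldRes + i];
}

// Store only `elements` lanes back, leaving rows beyond the block untouched.
template <int N>
void bstore_partial_model(const FakePacket (&acc)[N], float* res, int ldRes, int elements) {
  for (int m = 0; m < N; ++m)
    for (int i = 0; i < elements; ++i) res[m * ldRes + i] = acc[m][i];
}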
#ifdef _ARCH_PWR10
#define USE_P10_AND_PVIPR2_0 (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
#else
@@ -1199,6 +1222,14 @@ EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows)
#endif
}
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
{
for (int M = 0; M < N; M++) {
acc.packet[M] = pmadd<Packet>(pAlpha, accZ.packet[M], acc.packet[M]);
}
}
// Scale the PacketBlock vectors by alpha.
template<typename Packet, int N, bool mask>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask)
@@ -1209,9 +1240,7 @@ EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N
EIGEN_UNUSED_VARIABLE(pMask);
}
for (int M = 0; M < N; M++) {
acc.packet[M] = pmadd<Packet>(pAlpha, accZ.packet[M], acc.packet[M]);
}
bscale<Packet, N>(acc, accZ, pAlpha);
}
template<typename Packet, int N, bool real>
@@ -1461,6 +1490,13 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
MICRO_EXTRA_ROW<Scalar, Packet, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
}
#ifdef USE_PARTIAL_PACKETS
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(pMask);
bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row, remaining_rows);
bscale<Packet,accRows>(acc, accZero0, pAlpha);
bstore_partial<DataMapper, Packet, accRows>(acc, res, row, remaining_rows);
#else
bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row, 0);
if ((accRows == 1) || (rows >= accCols))
{
@@ -1474,6 +1510,7 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
}
}
}
#endif
}
#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
@@ -1565,16 +1602,35 @@ EIGEN_ALWAYS_INLINE void gemm_extra_row(
#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
#ifdef USE_PARTIAL_PACKETS
#define MICRO_STORE_ONE(iter) \
if (unroll_factor > iter) { \
if (MICRO_NORMAL_PARTIAL(iter)) { \
bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
} else { \
bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter*accCols, accCols2); \
bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter*accCols, accCols2); \
} \
}
#else
#define MICRO_STORE_ONE(iter) \
if (unroll_factor > iter) { \
bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
bscale<Packet,accRows,!(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask); \
bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
}
#endif
#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
#ifdef USE_PARTIAL_PACKETS
template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, bool full>
#else
template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
#endif
EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
const DataMapper& res,
const Scalar* lhs_base,
@@ -1585,7 +1641,12 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
Index strideB,
Index& row,
const Packet& pAlpha,
const Packet& pMask)
#ifdef USE_PARTIAL_PACKETS
Index accCols2
#else
const Packet& pMask
#endif
)
{
const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
@@ -1612,9 +1673,15 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
MICRO_UPDATE
}
#ifdef USE_PARTIAL_PACKETS
#define MICRO_UNROLL_ITER2(N, M) \
gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
if (M) return;
#else
#define MICRO_UNROLL_ITER2(N, M) \
gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
if (M) return;
#endif
template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
EIGEN_ALWAYS_INLINE void gemm_cols(
@@ -2094,22 +2161,22 @@ EIGEN_ALWAYS_INLINE void gemm_complex_cols(
switch( (rows-row)/accCols ) {
#if MAX_COMPLEX_UNROLL > 4
case 4:
MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
break;
#endif
#if MAX_COMPLEX_UNROLL > 3
case 3:
MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 3)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 3)
break;
#endif
#if MAX_COMPLEX_UNROLL > 2
case 2:
MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 2)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 2)
break;
#endif
#if MAX_COMPLEX_UNROLL > 1
case 1:
MICRO_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 1)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 1)
break;
#endif
default:

View File

@@ -5,6 +5,10 @@
#define EIGEN_POWER_PREFETCH(p)
#endif
#ifdef _ARCH_PWR9
#define USE_PARTIAL_PACKETS
#endif
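Gating USE_PARTIAL_PACKETS on _ARCH_PWR9 appears to track hardware support: POWER9 adds length-controlled vector loads and stores (exposed as vec_xl_len and vec_xst_len, which the PacketMath.h hunks further down use), and these are what make run-time-length partial packets cheap. A small illustration, assuming a POWER9 target and <altivec.h> (not Eigen code):

#include <altivec.h>   // requires -mcpu=power9 (or later) for vec_xl_len / vec_xst_len

// Illustrative only: load and store exactly n floats (n <= 4) in one instruction.
vector float load_n(const float* p, unsigned n) {
  return vec_xl_len(const_cast<float*>(p), n * sizeof(float));
}
void store_n(vector float v, float* p, unsigned n) {
  vec_xst_len(v, p, n * sizeof(float));
}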
#include "../../InternalHeaderCheck.h" #include "../../InternalHeaderCheck.h"
namespace Eigen { namespace Eigen {
@@ -89,6 +93,17 @@ EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const D
template<typename DataMapper, typename Packet, int N>
EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row);
#ifdef USE_PARTIAL_PACKETS
template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements);
template<typename DataMapper, typename Packet, Index N>
EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements);
#endif
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
template<typename Packet, int N, bool mask>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask);
@@ -101,7 +116,7 @@ EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Pa
#define MICRO_NORMAL(iter) \
(accCols == accCols2) || (unroll_factor != (iter + 1))
#define MICRO_UNROLL_ITER(func, N) \
#define MICRO_UNROLL_ITER1(func, N) \
switch (remaining_rows) { \
default: \
func(N, 0) \
@@ -121,6 +136,22 @@ EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Pa
break; \
}
#ifdef USE_PARTIAL_PACKETS
#define MICRO_UNROLL_ITER(func, N) \
if (remaining_rows) { \
func(N, true); \
} else { \
func(N, false); \
}
#define MICRO_NORMAL_PARTIAL(iter) \
full || (unroll_factor != (iter + 1))
#else
#define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
#endif
#define MICRO_COMPLEX_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
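The effect of the partial-packet variant of MICRO_UNROLL_ITER above: the remainder path no longer needs one compile-time instantiation per possible leftover row count, because the element count is passed at run time, so the switch in MICRO_UNROLL_ITER1 collapses to a single if. A simplified stand-alone illustration (hypothetical names, not the actual macros):

#include <cstdio>

template <int Remainder> void kernel_fixed() { std::printf("remainder handled at compile time: %d\n", Remainder); }
void kernel_partial(int remainder)          { std::printf("remainder handled at run time: %d\n", remainder); }
void kernel_full()                          { std::printf("no remainder\n"); }

// Old style: one template instantiation per possible remainder (cf. MICRO_UNROLL_ITER1).
void dispatch_masked(int remaining_rows) {
  switch (remaining_rows) {
    case 1: kernel_fixed<1>(); break;
    case 2: kernel_fixed<2>(); break;
    case 3: kernel_fixed<3>(); break;
    default: kernel_fixed<0>(); break;
  }
}

// New style under USE_PARTIAL_PACKETS: a boolean is enough (cf. the new MICRO_UNROLL_ITER).
void dispatch_partial(int remaining_rows) {
  if (remaining_rows) kernel_partial(remaining_rows);
  else                kernel_full();
}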
#define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)
#define MICRO_LOAD1(lhs_ptr, iter) \
@@ -161,9 +192,15 @@ EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Pa
#define MICRO_COMPLEX_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr_real, iter)
#ifdef USE_PARTIAL_PACKETS
#define MICRO_UPDATE_MASK
#else
#define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask);
#endif
#define MICRO_UPDATE \
if (accCols == accCols2) { \
EIGEN_UNUSED_VARIABLE(pMask); \
MICRO_UPDATE_MASK \
EIGEN_UNUSED_VARIABLE(offsetA); \
row += unroll_factor*accCols; \
}

View File

@@ -39,18 +39,34 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
__builtin_mma_xxsetaccz(acc);
}
#ifdef USE_PARTIAL_PACKETS
template<typename DataMapper, typename Packet, bool full>
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
#else
template<typename DataMapper, typename Packet, const Index accCols, const Index accCols2>
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
#endif
{
PacketBlock<Packet, 4> result;
__builtin_mma_disassemble_acc(&result.packet, acc);
PacketBlock<Packet, 4> tRes;
#ifdef USE_PARTIAL_PACKETS
if (full) {
EIGEN_UNUSED_VARIABLE(elements);
bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
bscale<Packet, 4>(tRes, result, alpha);
bstore<DataMapper, Packet, 4>(tRes, data, i);
} else {
bload_partial<DataMapper, Packet, 0, false, 4>(tRes, data, i, elements);
bscale<Packet, 4>(tRes, result, alpha);
bstore_partial<DataMapper, Packet, 4>(tRes, data, i, elements);
}
#else
bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
bscale<Packet, 4, (accCols != accCols2)>(tRes, result, alpha, pMask);
bstore<DataMapper, Packet, 4>(tRes, data, i);
#endif
}
template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
@@ -270,14 +286,25 @@ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
#ifdef USE_PARTIAL_PACKETS
#define MICRO_MMA_STORE_ONE(iter) \
if (unroll_factor > iter) { \
storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(iter)>(row + iter*accCols, res, pAlpha, accCols2, &accZero##iter); \
}
#else
#define MICRO_MMA_STORE_ONE(iter) \
if (unroll_factor > iter) { \
storeAccumulator<DataMapper, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
}
#endif
#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
#ifdef USE_PARTIAL_PACKETS
template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full>
#else
template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
#endif
EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
const DataMapper& res,
const Scalar* lhs_base,
@@ -287,7 +314,12 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
Index offsetA,
Index& row,
const Packet& pAlpha,
const Packet& pMask)
#ifdef USE_PARTIAL_PACKETS
Index accCols2
#else
const Packet& pMask
#endif
)
{
const Scalar* rhs_ptr = rhs_base;
const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
@@ -312,9 +344,15 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
MICRO_UPDATE
}
#ifdef USE_PARTIAL_PACKETS
#define MICRO_MMA_UNROLL_ITER2(N, M) \
gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
if (M) return;
#else
#define MICRO_MMA_UNROLL_ITER2(N, M) \
gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
if (M) return;
#endif
template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
EIGEN_ALWAYS_INLINE void gemmMMA_cols(
@@ -643,22 +681,22 @@ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
switch( (rows-row)/accCols ) {
#if MAX_COMPLEX_MMA_UNROLL > 4
case 4:
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
break;
#endif
#if MAX_COMPLEX_MMA_UNROLL > 3
case 3:
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
break;
#endif
#if MAX_COMPLEX_MMA_UNROLL > 2
case 2:
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
break;
#endif
#if MAX_COMPLEX_MMA_UNROLL > 1
case 1:
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
break;
#endif
default:

View File

@@ -513,6 +513,7 @@ EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* f
eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_ALIGNED_LOAD
EIGEN_UNUSED_VARIABLE(from);
Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
@@ -645,6 +646,7 @@ template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPAC
eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_UNUSED_VARIABLE(to);
EIGEN_DEBUG_ALIGNED_STORE
Packet store = from;
@@ -1215,6 +1217,7 @@ template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const
eigen_assert(n <= packet_size && "number of elements will read past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_ALIGNED_LOAD
EIGEN_DEBUG_UNALIGNED_LOAD
return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
@@ -1402,6 +1405,7 @@ template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPA
eigen_assert(n <= packet_size && "number of elements will write past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_UNALIGNED_STORE
vec_xst_len(from, to, n * size);
#else
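The EIGEN_UNUSED_VARIABLE(packet_size) additions in the four hunks above are the warning fix mentioned in the commit message: on POWER9 builds, packet_size is otherwise consumed only by the eigen_assert (compiled out under NDEBUG) and by the non-POWER9 fallback branch. A generic reduction of the idiom (illustrative; Eigen's own macro may be defined differently):

#include <cassert>
#define MARK_UNUSED(x) static_cast<void>(x)   // hypothetical stand-in for EIGEN_UNUSED_VARIABLE

int load_partial_shape(const int* from, int n, int packet_size) {
  MARK_UNUSED(packet_size);   // silences -Wunused-* when the assert below compiles away
  assert(n <= packet_size);   // only other use of packet_size; gone under NDEBUG
  return from[0] * n;         // placeholder for the length-controlled load
}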

View File

@@ -292,7 +292,7 @@ public:
}
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/ = 0) const {
return pgather_partial<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value(), n);
}
@@ -302,7 +302,7 @@
}
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/ = 0) const {
pscatter_partial<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value(), n);
}
@@ -343,7 +343,7 @@
}
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/ = 0) const {
return pgather_partial<Scalar,PacketType>(&operator()(i, j),m_incr.value(),n);
}
@@ -358,7 +358,7 @@
}
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/ = 0) const {
pscatter_partial<Scalar, PacketType>(&operator()(i, j), p, m_incr.value(), n);
}
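The `= 0` defaults on the trailing offset parameter above look like the compilation-error fix from the commit message: the new GEMM remainder path calls loadPacketPartial and storePacketPartial without an offset argument, which only compiles once that parameter has a default. A minimal stand-alone illustration with a made-up mapper type (not the Eigen class):

struct ToyMapper {
  // Trailing offset defaulted, mirroring the change above.
  float loadPartial(int i, int j, int n, int offset = 0) const {
    (void)i; (void)j; (void)offset;
    return static_cast<float>(n);
  }
};

int main() {
  ToyMapper m;
  return static_cast<int>(m.loadPartial(0, 0, 2));  // three-argument call now compiles
}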