Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-06-04 18:54:00 +08:00)

Performance improvements in GEMM for Power

This commit is contained in:
parent e1df3636b2
commit 403fa33409
@@ -114,11 +114,19 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
  Packet2cf res;
#ifdef __VSX__
  // Load a single std::complex<float> from memory and duplicate
  //
  // Using pload would read past the end of the reference in this case
  // Using vec_xl_len + vec_splat generates poor assembly
  __asm__ ("lxvdsx %x0,%y1" : "=wa" (res.v) : "Z" (from));
#else
  if((std::ptrdiff_t(&from) % 16) == 0)
    res.v = pload<Packet4f>((const float *)&from);
  else
    res.v = ploadu<Packet4f>((const float *)&from);
  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
#endif
  return res;
}

@@ -133,6 +141,7 @@ EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std
{
  Packet4f res0, res1;
#ifdef __VSX__
  // Load two std::complex<float> from memory and combine
  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
#ifdef _BIG_ENDIAN

@@ -186,7 +195,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
  Packet4f rev_a;
  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
  rev_a = vec_sld(a.v, a.v, 8);
  return Packet2cf(rev_a);
}

@@ -222,8 +231,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
  Packet4f tmp = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
  kernel.packet[1].v = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
  kernel.packet[0].v = tmp;
}
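The rewritten ptranspose above replaces the permute-mask pair with doubleword merges. As a rough standalone illustration of the same idea (not part of the patch, and assuming only a VSX-enabled GCC or Clang), treating each std::complex<float> as a single 64-bit lane makes the 2x2 transpose a plain mergeh/mergel:

#include <altivec.h>

int main() {
  // Two rows of a 2x2 complex matrix, one complex number per 64-bit lane:
  // a = {c00, c01}, b = {c10, c11}
  __vector double a = { 0.0, 1.0 };
  __vector double b = { 2.0, 3.0 };
  __vector double t0 = vec_mergeh(a, b);   // {c00, c10} -> first transposed row
  __vector double t1 = vec_mergel(a, b);   // {c01, c11} -> second transposed row
  return (t0[1] == 2.0 && t1[0] == 1.0) ? 0 : 1;
}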
@@ -358,7 +367,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c

template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
  EIGEN_ALIGN16 std::complex<double> res[2];
  EIGEN_ALIGN16 std::complex<double> res[1];
  pstore<std::complex<double> >(res, a);

  return res[0];

@@ -384,8 +393,8 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
{
  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
  Packet2d tmp = vec_mergeh(kernel.packet[0].v, kernel.packet[1].v);
  kernel.packet[1].v = vec_mergel(kernel.packet[0].v, kernel.packet[1].v);
  kernel.packet[0].v = tmp;
}

(File diff suppressed because it is too large.)
@ -19,10 +19,9 @@ EIGEN_ALWAYS_INLINE void gemm_extra_row(
|
||||
Index depth,
|
||||
Index strideA,
|
||||
Index offsetA,
|
||||
Index strideB,
|
||||
Index row,
|
||||
Index col,
|
||||
Index rows,
|
||||
Index cols,
|
||||
Index remaining_rows,
|
||||
const Packet& pAlpha,
|
||||
const Packet& pMask);
|
||||
@ -57,9 +56,7 @@ EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
|
||||
Index offsetA,
|
||||
Index strideB,
|
||||
Index row,
|
||||
Index col,
|
||||
Index rows,
|
||||
Index cols,
|
||||
Index remaining_rows,
|
||||
const Packet& pAlphaReal,
|
||||
const Packet& pAlphaImag,
|
||||
@ -83,79 +80,100 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
|
||||
const Packet& pAlphaImag,
|
||||
const Packet& pMask);
|
||||
|
||||
template<typename Scalar, typename Packet>
|
||||
EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
|
||||
template<typename Packet>
|
||||
EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs);
|
||||
|
||||
template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
|
||||
template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N, bool full = true>
|
||||
EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col);
|
||||
|
||||
template<typename Packet, int N>
|
||||
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
|
||||
template<typename DataMapper, typename Packet, typename Index, int N>
|
||||
EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row);
|
||||
|
||||
template<typename Packet, int N>
|
||||
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
|
||||
template<typename Packet, int N, bool mask>
|
||||
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask);
|
||||
|
||||
// Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
|
||||
template<typename Packet, typename Packetc, int N>
|
||||
EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
|
||||
{
|
||||
acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]);
|
||||
if (N > 1) {
|
||||
acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]);
|
||||
}
|
||||
if (N > 2) {
|
||||
acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]);
|
||||
}
|
||||
if (N > 3) {
|
||||
acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]);
|
||||
template<typename Packet, int N, bool mask>
|
||||
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask);
|
||||
|
||||
template<typename Packet, typename Packetc, int N, bool full>
|
||||
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2);
|
||||
|
||||
#define MICRO_NORMAL(iter) \
|
||||
(accCols == accCols2) || (unroll_factor != (iter + 1))
|
||||
|
||||
#define MICRO_UNROLL_ITER(func, N) \
|
||||
switch (remaining_rows) { \
|
||||
default: \
|
||||
func(N, 0) \
|
||||
break; \
|
||||
case 1: \
|
||||
func(N, 1) \
|
||||
break; \
|
||||
case 2: \
|
||||
if (sizeof(Scalar) == sizeof(float)) { \
|
||||
func(N, 2) \
|
||||
} \
|
||||
break; \
|
||||
case 3: \
|
||||
if (sizeof(Scalar) == sizeof(float)) { \
|
||||
func(N, 3) \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]);
|
||||
if (N > 1) {
|
||||
acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]);
|
||||
}
|
||||
if (N > 2) {
|
||||
acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]);
|
||||
}
|
||||
if (N > 3) {
|
||||
acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]);
|
||||
}
|
||||
}
|
||||
#define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)
|
||||
|
||||
template<typename Packet, typename Packetc, int N>
|
||||
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
|
||||
{
|
||||
bcouple_common<Packet, Packetc, N>(taccReal, taccImag, acc1, acc2);
|
||||
|
||||
acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
|
||||
if (N > 1) {
|
||||
acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
|
||||
}
|
||||
if (N > 2) {
|
||||
acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
|
||||
}
|
||||
if (N > 3) {
|
||||
acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
|
||||
#define MICRO_LOAD1(lhs_ptr, iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter); \
|
||||
lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsV##iter); \
|
||||
}
|
||||
|
||||
acc2.packet[0] = padd<Packetc>(tRes.packet[0+N], acc2.packet[0]);
|
||||
if (N > 1) {
|
||||
acc2.packet[1] = padd<Packetc>(tRes.packet[1+N], acc2.packet[1]);
|
||||
}
|
||||
if (N > 2) {
|
||||
acc2.packet[2] = padd<Packetc>(tRes.packet[2+N], acc2.packet[2]);
|
||||
}
|
||||
if (N > 3) {
|
||||
acc2.packet[3] = padd<Packetc>(tRes.packet[3+N], acc2.packet[3]);
|
||||
}
|
||||
}
|
||||
#define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter)
|
||||
|
||||
#define MICRO_COMPLEX_LOAD_ONE(iter) \
|
||||
if (!LhsIsReal && (unroll_factor > iter)) { \
|
||||
lhsVi##iter = ploadLhs<Packet>(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
||||
} \
|
||||
MICRO_LOAD1(lhs_ptr_real, iter) \
|
||||
|
||||
#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
lhs_ptr##iter = lhs_base + (row+(iter*accCols))*strideA*advRows - MICRO_NORMAL_COLS(iter, 0, (accCols-accCols2)*offsetA); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter)
|
||||
|
||||
#define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter)
|
||||
|
||||
#define MICRO_PREFETCH1(lhs_ptr, iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr, iter)
|
||||
|
||||
#define MICRO_COMPLEX_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr_real, iter)
|
||||
|
||||
#define MICRO_UPDATE \
|
||||
if (accCols == accCols2) { \
|
||||
EIGEN_UNUSED_VARIABLE(pMask); \
|
||||
EIGEN_UNUSED_VARIABLE(offsetA); \
|
||||
row += unroll_factor*accCols; \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_UPDATE \
|
||||
MICRO_UPDATE \
|
||||
if(LhsIsReal || (accCols == accCols2)) { \
|
||||
EIGEN_UNUSED_VARIABLE(imag_delta2); \
|
||||
}
|
||||
|
||||
// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
|
||||
template<typename Scalar, typename Packet>
|
||||
EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs)
|
||||
{
|
||||
return ploadu<Packet>(rhs);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
@@ -21,6 +21,9 @@
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !__has_builtin(__builtin_vsx_disassemble_pair)
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
#endif
#endif

#include "../../InternalHeaderCheck.h"
|
||||
@ -29,44 +32,48 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Scalar, typename Packet>
#define accColsC (accCols / 2)

EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
{
  __builtin_mma_xxsetaccz(acc);
}

template<typename DataMapper, typename Index, typename Packet, const Index accCols>
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
template<typename DataMapper, typename Index, typename Packet, const Index accCols, const Index accCols2>
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
{
  PacketBlock<Packet, 4> result;
  __builtin_mma_disassemble_acc(&result.packet, acc);

  PacketBlock<Packet, 4> tRes;
  bload<DataMapper, Packet, Index, accCols, ColMajor, false, 4>(tRes, data, i, 0);
  bload<DataMapper, Packet, Index, 0, ColMajor, false, 4>(tRes, data, i, 0);

  bscale<Packet, 4>(tRes, result, alpha);
  bscale<Packet, 4, (accCols != accCols2)>(tRes, result, alpha, pMask);

  data.template storePacketBlock<Packet, 4>(i, 0, tRes);
  bstore<DataMapper, Packet, Index, 4>(tRes, data, i);
}
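storeAccumulator above unpacks an MMA accumulator with __builtin_mma_disassemble_acc before scaling and storing it, and bsetzeroMMA zeroes it with __builtin_mma_xxsetaccz. A minimal sketch of that accumulator lifecycle, assuming a POWER10 toolchain with MMA enabled (-mcpu=power10 -mmma or equivalent); the values are arbitrary and only illustrate the zero / rank-one update / disassemble sequence that the kernel drives through pgerMMA:

#include <altivec.h>

int main() {
  __vector float av = { 1.f, 2.f, 3.f, 4.f };
  __vector float bv = { 1.f, 1.f, 1.f, 1.f };
  __vector unsigned char a = (__vector unsigned char) av;   // reinterpret for the GER builtin
  __vector unsigned char b = (__vector unsigned char) bv;

  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);             // zero the 4x4 accumulator
  __builtin_mma_xvf32gerpp(&acc, a, b);      // acc += outer(av, bv), one GER per k step
  __vector float rows[4];
  __builtin_mma_disassemble_acc(rows, &acc); // unpack into four row vectors for scaling/storing
  return 0;
}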
|
||||
|
||||
template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC>
|
||||
EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
|
||||
template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
|
||||
EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag)
|
||||
{
|
||||
const bool full = (accCols2 > accColsC);
|
||||
PacketBlock<Packet, 4> resultReal, resultImag;
|
||||
__builtin_mma_disassemble_acc(&resultReal.packet, accReal);
|
||||
__builtin_mma_disassemble_acc(&resultImag.packet, accImag);
|
||||
|
||||
PacketBlock<Packetc, 8> tRes;
|
||||
bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4>(tRes, data, i, 0);
|
||||
bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4, full>(tRes, data, i, 0);
|
||||
|
||||
PacketBlock<Packet,4> taccReal, taccImag;
|
||||
bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);
|
||||
PacketBlock<Packet, 4> taccReal, taccImag;
|
||||
bscalec<Packet, 4, (accCols != accCols2)>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask);
|
||||
|
||||
PacketBlock<Packetc, 4> acc1, acc2;
|
||||
bcouple<Packet, Packetc, 4>(taccReal, taccImag, tRes, acc1, acc2);
|
||||
bcouple<Packet, Packetc, 4, full>(taccReal, taccImag, tRes, acc1, acc2);
|
||||
|
||||
data.template storePacketBlock<Packetc, 4>(i, 0, acc1);
|
||||
data.template storePacketBlock<Packetc, 4>(i + accColsC, 0, acc2);
|
||||
bstore<DataMapper, Packetc, Index, 4>(acc1, data, i);
|
||||
if (full) {
|
||||
bstore<DataMapper, Packetc, Index, 4>(acc2, data, i + accColsC);
|
||||
}
|
||||
}
|
||||
|
||||
// Defaults to float32; since Eigen still supports C++03 we can't use default template arguments
|
||||
@ -81,18 +88,6 @@ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L
|
||||
}
|
||||
}
|
||||
|
||||
template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
|
||||
EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
|
||||
{
|
||||
__vector_pair* a0 = reinterpret_cast<__vector_pair *>(const_cast<Packet2d *>(&a.packet[0]));
|
||||
if(NegativeAccumulate)
|
||||
{
|
||||
__builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
|
||||
} else {
|
||||
__builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
|
||||
EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
|
||||
{
|
||||
@ -104,18 +99,13 @@ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con
|
||||
}
|
||||
}
|
||||
|
||||
template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
|
||||
EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&)
|
||||
{
|
||||
// Just for compilation
|
||||
}
|
||||
|
||||
template<typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||
EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi)
|
||||
template<typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||
EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi, const RhsPacket& rhsV, RhsPacket& rhsVi)
|
||||
{
|
||||
pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
|
||||
if(LhsIsReal) {
|
||||
pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
|
||||
EIGEN_UNUSED_VARIABLE(lhsVi);
|
||||
} else {
|
||||
if(!RhsIsReal) {
|
||||
pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
|
||||
@ -128,35 +118,33 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag
|
||||
}
|
||||
|
||||
// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
|
||||
template<typename Packet>
|
||||
EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet)* rhs)
|
||||
{
|
||||
return ploadu<Packet>(rhs);
|
||||
}
|
||||
|
||||
template<typename Scalar, typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
|
||||
{
|
||||
rhsV = ploadRhs<Scalar, Packet>(rhs);
|
||||
rhsV = ploadRhs<Packet>(rhs);
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
|
||||
{
|
||||
rhsV.packet[0] = ploadRhs<double, Packet2d>(rhs);
|
||||
rhsV.packet[1] = ploadRhs<double, Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)));
|
||||
}
|
||||
|
||||
template<>
EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV)
{
#if EIGEN_COMP_LLVM
  __builtin_vsx_assemble_pair(&rhsV,
    reinterpret_cast<__vector unsigned char>(ploadRhs<double, Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
    reinterpret_cast<__vector unsigned char>(ploadRhs<double, Packet2d>(rhs)));
    reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
    reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
#else
  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
#endif
}
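The fallback branch above fills a __vector_pair with a single lxvp, loading 32 bytes of the packed RHS in one instruction. A small sketch of producing and consuming such a pair outside of Eigen, assuming a GCC or Clang build targeting POWER10 (the disassemble builtin is the same one the new MICRO_MMA_LOAD1_TWO macro relies on):

#include <altivec.h>

int main() {
  double rhs[4] = { 1.0, 2.0, 3.0, 4.0 };
  __vector_pair pair;
  __asm__ ("lxvp %x0,%1" : "=wa" (pair) : "Y" (*rhs));   // one 32-byte load

  __vector double halves[2];
  __builtin_vsx_disassemble_pair(halves, &pair);          // split into two 2-double vectors
  return (halves[0][0] + halves[1][0]) > 0.0 ? 0 : 1;
}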
|
||||
|
||||
template<>
|
||||
EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
|
||||
EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
|
||||
{
|
||||
// Just for compilation
|
||||
ploadRhsMMA(lhs, lhsV);
|
||||
}
|
||||
|
||||
// PEEL_MMA loop factor.
|
||||
@ -165,98 +153,116 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
|
||||
#define MICRO_MMA_UNROLL(func) \
|
||||
func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
|
||||
|
||||
#define MICRO_MMA_LOAD_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
|
||||
lhs_ptr##iter += accCols; \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsV##iter); \
|
||||
}
|
||||
#define MICRO_MMA_WORK(func, type, peel) \
|
||||
func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
|
||||
func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel)
|
||||
|
||||
#define MICRO_MMA_WORK_ONE(iter, type, peel) \
|
||||
if (unroll_factor > iter) { \
|
||||
pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \
|
||||
#define MICRO_MMA_WORK_TWO(iter, type, peel) \
|
||||
if (unroll_factor > iter) { \
|
||||
pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV2##iter.packet[peel & 1]); \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
if (MICRO_NORMAL(iter)) { \
|
||||
ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##iter), plhsV##iter); \
|
||||
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##iter.packet), &plhsV##iter); \
|
||||
lhs_ptr##iter += accCols*2; \
|
||||
} else { \
|
||||
lhsV2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr##iter); \
|
||||
lhsV2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr##iter + accCols2); \
|
||||
lhs_ptr##iter += accCols2*2; \
|
||||
EIGEN_UNUSED_VARIABLE(plhsV##iter) \
|
||||
} \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsV2##iter); \
|
||||
EIGEN_UNUSED_VARIABLE(plhsV##iter) \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter)
|
||||
|
||||
#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
|
||||
if (PEEL_MMA > peel) { \
|
||||
Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
|
||||
ploadRhsMMA<Scalar, type>(rhs_ptr + (accRows * peel), rhsV##peel); \
|
||||
MICRO_MMA_UNROLL(func2); \
|
||||
func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
|
||||
func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \
|
||||
ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV##peel); \
|
||||
MICRO_MMA_UNROLL(funcl) \
|
||||
MICRO_MMA_WORK(funcw, type, peel) \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(rhsV##peel); \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
|
||||
#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
|
||||
if (PEEL_MMA > peel2) { \
|
||||
PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
|
||||
__vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
|
||||
ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV##peel1); \
|
||||
ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV##peel2); \
|
||||
MICRO_MMA_UNROLL(funcl2) \
|
||||
MICRO_MMA_WORK(funcw2, type, peel1) \
|
||||
MICRO_MMA_WORK(funcw2, type, peel2) \
|
||||
} else { \
|
||||
MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
|
||||
type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
|
||||
MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
|
||||
MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
|
||||
MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
|
||||
MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7);
|
||||
MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
|
||||
MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
|
||||
MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \
|
||||
MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7)
|
||||
|
||||
#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
|
||||
#define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
|
||||
type rhsV0; \
|
||||
MICRO_MMA_TYPE_PEEL(func,func2,type,0);
|
||||
MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
|
||||
|
||||
#define MICRO_MMA_ONE_PEEL \
|
||||
if (sizeof(Scalar) == sizeof(float)) { \
|
||||
MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
|
||||
} else { \
|
||||
MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
|
||||
} \
|
||||
rhs_ptr += (accRows * PEEL_MMA);
|
||||
#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
|
||||
MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
|
||||
rhs_ptr += (accRows * size);
|
||||
|
||||
#define MICRO_MMA_ONE \
|
||||
if (sizeof(Scalar) == sizeof(float)) { \
|
||||
MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
|
||||
} else { \
|
||||
MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
|
||||
} \
|
||||
rhs_ptr += accRows;
|
||||
#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
|
||||
MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
|
||||
rhs_ptr += (accRows * size);
|
||||
|
||||
#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
|
||||
|
||||
#define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
|
||||
|
||||
#define MICRO_MMA_DST_PTR_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
bsetzeroMMA<Scalar, Packet>(&accZero##iter); \
|
||||
bsetzeroMMA(&accZero##iter); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(accZero##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
|
||||
|
||||
#define MICRO_MMA_SRC_PTR_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
|
||||
}
|
||||
#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_SRC_PTR_ONE)
|
||||
|
||||
#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE)
|
||||
|
||||
#define MICRO_MMA_PREFETCH_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE)
|
||||
#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
|
||||
|
||||
#define MICRO_MMA_STORE_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, res, pAlpha, &accZero##iter); \
|
||||
storeAccumulator<DataMapper, Index, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
|
||||
|
||||
template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
|
||||
template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index accCols2>
|
||||
EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
|
||||
const DataMapper& res,
|
||||
const Scalar* lhs_base,
|
||||
const Scalar* rhs_base,
|
||||
Index depth,
|
||||
Index strideA,
|
||||
Index offsetA,
|
||||
Index& row,
|
||||
const Packet& pAlpha)
|
||||
const Packet& pAlpha,
|
||||
const Packet& pMask)
|
||||
{
|
||||
const Scalar* rhs_ptr = rhs_base;
|
||||
const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
|
||||
@ -265,8 +271,8 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
|
||||
MICRO_MMA_SRC_PTR
|
||||
MICRO_MMA_DST_PTR
|
||||
|
||||
Index k = 0;
|
||||
for(; k + PEEL_MMA <= depth; k+= PEEL_MMA)
|
||||
Index k = 0, depth2 = depth - PEEL_MMA;
|
||||
for(; k <= depth2; k += PEEL_MMA)
|
||||
{
|
||||
EIGEN_POWER_PREFETCH(rhs_ptr);
|
||||
MICRO_MMA_PREFETCH
|
||||
@ -278,9 +284,13 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
|
||||
}
|
||||
MICRO_MMA_STORE
|
||||
|
||||
row += unroll_factor*accCols;
|
||||
MICRO_UPDATE
|
||||
}
|
||||
|
||||
#define MICRO_MMA_UNROLL_ITER2(N, M) \
|
||||
gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
|
||||
if (M) return;
|
||||
|
||||
template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
|
||||
EIGEN_ALWAYS_INLINE void gemmMMA_cols(
|
||||
const DataMapper& res,
|
||||
@ -293,7 +303,6 @@ EIGEN_ALWAYS_INLINE void gemmMMA_cols(
|
||||
Index offsetB,
|
||||
Index col,
|
||||
Index rows,
|
||||
Index cols,
|
||||
Index remaining_rows,
|
||||
const Packet& pAlpha,
|
||||
const Packet& pMask)
|
||||
@ -306,42 +315,42 @@ EIGEN_ALWAYS_INLINE void gemmMMA_cols(
|
||||
|
||||
#define MAX_MMA_UNROLL 7
|
||||
while(row + MAX_MMA_UNROLL*accCols <= rows) {
|
||||
gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_MMA_UNROLL_ITER2(MAX_MMA_UNROLL, 0);
|
||||
}
|
||||
switch( (rows-row)/accCols ) {
|
||||
#if MAX_MMA_UNROLL > 7
|
||||
case 7:
|
||||
gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_MMA_UNROLL > 6
|
||||
case 6:
|
||||
gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_MMA_UNROLL > 5
|
||||
case 5:
|
||||
gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_MMA_UNROLL > 4
|
||||
case 4:
|
||||
gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_MMA_UNROLL > 3
|
||||
case 3:
|
||||
gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_MMA_UNROLL > 2
|
||||
case 2:
|
||||
gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_MMA_UNROLL > 1
|
||||
case 1:
|
||||
gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||
MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 1)
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
@ -351,7 +360,7 @@ EIGEN_ALWAYS_INLINE void gemmMMA_cols(
|
||||
|
||||
if(remaining_rows > 0)
|
||||
{
|
||||
gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||
gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
|
||||
}
|
||||
}
|
||||
|
||||
@ -366,16 +375,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
||||
const Packet pAlpha = pset1<Packet>(alpha);
|
||||
const Packet pMask = bmask<Packet>(remaining_rows);
|
||||
|
||||
typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
|
||||
|
||||
Index col = 0;
|
||||
for(; col + accRows <= cols; col += accRows)
|
||||
{
|
||||
gemmMMA_cols<Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||
gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
|
||||
}
|
||||
|
||||
gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||
if (col != cols)
|
||||
{
|
||||
gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||
}
|
||||
}
|
||||
|
||||
#define accColsC (accCols / 2)
|
||||
#define advanceRows ((LhsIsReal) ? 1 : 2)
|
||||
#define advanceCols ((RhsIsReal) ? 1 : 2)
|
||||
|
||||
@ -385,74 +398,104 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
||||
#define MICRO_COMPLEX_MMA_UNROLL(func) \
|
||||
func(0) func(1) func(2) func(3)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
|
||||
if(!LhsIsReal) { \
|
||||
lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
||||
} \
|
||||
lhs_ptr_real##iter += accCols; \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsV##iter); \
|
||||
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
||||
}
|
||||
#define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
|
||||
func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
|
||||
if (unroll_factor > iter) { \
|
||||
pgercMMA<Scalar, Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
|
||||
pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
|
||||
#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \
|
||||
if (unroll_factor > iter) { \
|
||||
pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV##peel, rhsVi##peel); \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \
|
||||
if (!LhsIsReal && (unroll_factor > iter)) { \
|
||||
if (MICRO_NORMAL(iter)) { \
|
||||
ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##iter + imag_delta), plhsVi##iter); \
|
||||
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##iter.packet), &plhsVi##iter); \
|
||||
} else { \
|
||||
lhsVi2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2); \
|
||||
lhsVi2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2 + accCols2); \
|
||||
EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
|
||||
} \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhsVi2##iter); \
|
||||
EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
|
||||
} \
|
||||
MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
|
||||
if (PEEL_COMPLEX_MMA > peel) { \
|
||||
Packet lhsV0, lhsV1, lhsV2, lhsV3; \
|
||||
Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
|
||||
ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
|
||||
ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \
|
||||
if(!RhsIsReal) { \
|
||||
ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
|
||||
ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
|
||||
} \
|
||||
MICRO_COMPLEX_MMA_UNROLL(func2); \
|
||||
func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
|
||||
MICRO_COMPLEX_MMA_UNROLL(funcl) \
|
||||
MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(rhsV##peel); \
|
||||
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
|
||||
#define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
|
||||
if (PEEL_COMPLEX_MMA > peel2) { \
|
||||
PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
|
||||
PacketBlock<Packet,2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
|
||||
__vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
|
||||
__vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
|
||||
ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV##peel1); \
|
||||
ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV##peel2); \
|
||||
if(!RhsIsReal) { \
|
||||
ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi##peel1); \
|
||||
ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi##peel2); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(rhsVi##peel1); \
|
||||
EIGEN_UNUSED_VARIABLE(rhsVi##peel2); \
|
||||
} \
|
||||
MICRO_COMPLEX_MMA_UNROLL(funcl2) \
|
||||
MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
|
||||
MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
|
||||
} else { \
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
|
||||
type rhsV0, rhsV1, rhsV2, rhsV3; \
|
||||
type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3);
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
|
||||
type rhsV0, rhsVi0; \
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0);
|
||||
MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_ONE_PEEL \
|
||||
if (sizeof(Scalar) == sizeof(float)) { \
|
||||
MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
|
||||
} else { \
|
||||
MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
|
||||
} \
|
||||
rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \
|
||||
if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA);
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
|
||||
MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
|
||||
rhs_ptr_real += (accRows * size); \
|
||||
if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
|
||||
|
||||
#define MICRO_COMPLEX_MMA_ONE \
|
||||
if (sizeof(Scalar) == sizeof(float)) { \
|
||||
MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
|
||||
} else { \
|
||||
MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
|
||||
} \
|
||||
rhs_ptr_real += accRows; \
|
||||
if(!RhsIsReal) rhs_ptr_imag += accRows;
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
|
||||
MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
|
||||
rhs_ptr_real += (accRows * size); \
|
||||
if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
|
||||
|
||||
#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
bsetzeroMMA<Scalar, Packet>(&accReal##iter); \
|
||||
bsetzeroMMA<Scalar, Packet>(&accImag##iter); \
|
||||
bsetzeroMMA(&accReal##iter); \
|
||||
bsetzeroMMA(&accImag##iter); \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(accReal##iter); \
|
||||
EIGEN_UNUSED_VARIABLE(accImag##iter); \
|
||||
@ -460,44 +503,35 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
||||
|
||||
#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
|
||||
} else { \
|
||||
EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
|
||||
}
|
||||
#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
|
||||
#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
|
||||
|
||||
#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
|
||||
if (unroll_factor > iter) { \
|
||||
storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC>(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
|
||||
storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
|
||||
|
||||
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||
EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
|
||||
const DataMapper& res,
|
||||
const Scalar* lhs_base,
|
||||
const Scalar* rhs_base,
|
||||
Index depth,
|
||||
Index strideA,
|
||||
Index offsetA,
|
||||
Index strideB,
|
||||
Index& row,
|
||||
const Packet& pAlphaReal,
|
||||
const Packet& pAlphaImag)
|
||||
const Packet& pAlphaImag,
|
||||
const Packet& pMask)
|
||||
{
|
||||
const Scalar* rhs_ptr_real = rhs_base;
|
||||
const Scalar* rhs_ptr_imag = NULL;
|
||||
const Index imag_delta = accCols*strideA;
|
||||
const Index imag_delta2 = accCols2*strideA;
|
||||
if(!RhsIsReal) {
|
||||
rhs_ptr_imag = rhs_base + accRows*strideB;
|
||||
} else {
|
||||
@ -510,8 +544,8 @@ EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
|
||||
MICRO_COMPLEX_MMA_SRC_PTR
|
||||
MICRO_COMPLEX_MMA_DST_PTR
|
||||
|
||||
Index k = 0;
|
||||
for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA)
|
||||
Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA;
|
||||
for(; k <= depth2; k += PEEL_COMPLEX_MMA)
|
||||
{
|
||||
EIGEN_POWER_PREFETCH(rhs_ptr_real);
|
||||
if(!RhsIsReal) {
|
||||
@ -526,9 +560,13 @@ EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
|
||||
}
|
||||
MICRO_COMPLEX_MMA_STORE
|
||||
|
||||
row += unroll_factor*accCols;
|
||||
MICRO_COMPLEX_UPDATE
|
||||
}
|
||||
|
||||
#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
|
||||
gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
|
||||
if (M) return;
|
||||
|
||||
template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||
EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
|
||||
const DataMapper& res,
|
||||
@ -541,7 +579,6 @@ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
|
||||
Index offsetB,
|
||||
Index col,
|
||||
Index rows,
|
||||
Index cols,
|
||||
Index remaining_rows,
|
||||
const Packet& pAlphaReal,
|
||||
const Packet& pAlphaImag,
|
||||
@ -555,27 +592,27 @@ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
|
||||
|
||||
#define MAX_COMPLEX_MMA_UNROLL 4
|
||||
while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
|
||||
gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||
MICRO_COMPLEX_MMA_UNROLL_ITER2(MAX_COMPLEX_MMA_UNROLL, 0);
|
||||
}
|
||||
switch( (rows-row)/accCols ) {
|
||||
#if MAX_COMPLEX_MMA_UNROLL > 4
|
||||
case 4:
|
||||
gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_COMPLEX_MMA_UNROLL > 3
|
||||
case 3:
|
||||
gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_COMPLEX_MMA_UNROLL > 2
|
||||
case 2:
|
||||
gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
|
||||
break;
|
||||
#endif
|
||||
#if MAX_COMPLEX_MMA_UNROLL > 1
|
||||
case 1:
|
||||
gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||
MICRO_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
@ -585,7 +622,7 @@ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
|
||||
|
||||
if(remaining_rows > 0)
|
||||
{
|
||||
gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||
gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||
}
|
||||
}
|
||||
|
||||
@ -604,13 +641,18 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS
|
||||
const Scalar* blockA = (Scalar *) blockAc;
|
||||
const Scalar* blockB = (Scalar *) blockBc;
|
||||
|
||||
typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
|
||||
|
||||
Index col = 0;
|
||||
for(; col + accRows <= cols; col += accRows)
|
||||
{
|
||||
gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||
gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||
}
|
||||
|
||||
gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||
if (col != cols)
|
||||
{
|
||||
gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||
}
|
||||
}
|
||||
|
||||
#undef accColsC
|
||||
|
@ -503,7 +503,11 @@ EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) {
|
||||
|
||||
/** \internal packet conjugate with real & imaginary operation inverted */
|
||||
EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) {
|
||||
#ifdef __POWER8_VECTOR__
|
||||
return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
|
||||
#else
|
||||
return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) {
|
||||
@ -555,12 +559,20 @@ EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a)
|
||||
/** \internal packet negate */
|
||||
EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a)
|
||||
{
|
||||
#ifdef __POWER8_VECTOR__
|
||||
return Packet2cf(vec_neg(a.v));
|
||||
#else
|
||||
return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a)
|
||||
{
|
||||
#ifdef __POWER8_VECTOR__
|
||||
return Packet1cd(vec_neg(a.v));
|
||||
#else
|
||||
return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \internal flip the real & imaginary results and negate */
|
||||
@ -637,13 +649,24 @@ EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef __POWER8_VECTOR__
const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B };

const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
#endif

/** \internal load two vectors from the interleaved real & imaginary values of src */
template<typename RhsScalar>
EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i)
{
  Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
#ifdef __POWER8_VECTOR__
  r = vec_mergee(t, t);
  i = vec_mergeo(t, t);
#else
  r = vec_perm(t, t, p16uc_MERGEE);
  i = vec_perm(t, t, p16uc_MERGEO);
#endif
}
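pload_realimag_row above splits interleaved {r0, i0, r1, i1} data into duplicated real and imaginary lanes, via vec_mergee/vec_mergeo on Power8 and later or the equivalent permute masks otherwise. A standalone sketch of what those even/odd merges produce, assuming a compiler targeting Power8 or newer:

#include <altivec.h>

int main() {
  __vector float t = { 1.0f, 2.0f, 3.0f, 4.0f };  // {r0, i0, r1, i1}
  __vector float r = vec_mergee(t, t);            // {r0, r0, r1, r1}
  __vector float i = vec_mergeo(t, t);            // {i0, i0, i1, i1}
  return (r[0] == 1.0f && r[1] == 1.0f && i[0] == 2.0f) ? 0 : 1;
}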
|
||||
|
||||
template<typename RhsScalar>
|
||||
@ -909,7 +932,7 @@ EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, A
|
||||
{
|
||||
PResPacket c2 = pcplxflipconj(c0);
|
||||
PResPacket c3 = pcplxflipconj(c1);
|
||||
#if EIGEN_COMP_LLVM
|
||||
#if EIGEN_COMP_LLVM || !defined(_ARCH_PWR10)
|
||||
ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
|
||||
ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
|
||||
PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
|
||||
|
@ -83,8 +83,10 @@ static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
|
||||
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
|
||||
static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
|
||||
static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
|
||||
#ifndef __POWER8_VECTOR__
|
||||
static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
|
||||
static EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
|
||||
#endif
|
||||
static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
|
||||
#ifndef __VSX__
|
||||
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
|
||||
@ -102,11 +104,13 @@ static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
|
||||
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
|
||||
static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
|
||||
#ifndef _ARCH_PWR9
|
||||
static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
|
||||
#endif
|
||||
|
||||
#ifdef _BIG_ENDIAN
|
||||
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
|
||||
static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
|
||||
static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
|
||||
#endif
|
||||
static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
|
||||
static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
|
||||
|
||||
@ -116,15 +120,11 @@ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3
|
||||
// Define global static constants:
|
||||
#ifdef _BIG_ENDIAN
|
||||
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
|
||||
#ifdef __VSX__
|
||||
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
||||
#endif
|
||||
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
||||
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
||||
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
||||
#else
|
||||
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
|
||||
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
||||
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
||||
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
||||
static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
||||
@ -137,12 +137,6 @@ static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;
|
||||
|
||||
static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
|
||||
|
||||
#ifdef _BIG_ENDIAN
|
||||
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
||||
#else
|
||||
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
||||
#endif // _BIG_ENDIAN
|
||||
|
||||
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
||||
#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
||||
#else
|
||||
@ -788,8 +782,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a,
template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return p4f_ZERO - a;
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return p4i_ZERO - a;
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
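The one-line pnegate overloads above are the pre-patch definitions; the guarded versions replace them. On POWER8 and later, vec_neg lets the compiler use a hardware negate instead of materialising a zero vector and subtracting; both branches produce the same lane-wise result, and the Packet2d overload further down gets the same treatment. A minimal scalar model, illustration only:

#include <array>

// Lane-wise behaviour of pnegate on a 4-float packet: {a0,a1,a2,a3} -> {-a0,-a1,-a2,-a3}.
// vec_neg(a) and (p4f_ZERO - a) compute the same thing; the intrinsic avoids the zero constant.
std::array<float, 4> pnegate_model(const std::array<float, 4>& a) {
  return { -a[0], -a[1], -a[2], -a[3] };
}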
@ -953,7 +961,10 @@ template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef _BIG_ENDIAN
#if defined(__VSX__) || !defined(_BIG_ENDIAN)
  EIGEN_DEBUG_UNALIGNED_LOAD
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  Packet16uc MSQ, LSQ;
  Packet16uc mask;
  MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
@ -961,9 +972,6 @@ template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPAC
  mask = vec_lvsl(0, from); // create the permute mask
  //TODO: Add static_cast here
  return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
#else
  EIGEN_DEBUG_UNALIGNED_LOAD
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#endif
}

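With VSX (or on little endian) the unaligned load is now a single vec_xl; only the pre-VSX big-endian fallback keeps the classic two-load-and-permute sequence shown above. A scalar model of that fallback, purely illustrative and assuming a 16-byte quadword:

#include <cstdint>
#include <cstring>

// Model of the pre-VSX path: vec_ld rounds the address down, so the two aligned
// quadwords straddling 'from' are read and the vec_lvsl/vec_perm pair picks the
// 16 bytes that start at 'from'. Like the hardware trick, it touches both quadwords.
void unaligned_load_model(const unsigned char* from, unsigned char out[16]) {
  const std::uintptr_t offset = reinterpret_cast<std::uintptr_t>(from) % 16;
  const unsigned char* msq = from - offset;   // start of the first aligned quadword
  unsigned char quads[32];
  std::memcpy(quads, msq, 32);                // MSQ followed by LSQ
  std::memcpy(out, quads + offset, 16);       // what the permute mask selects
}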
@ -1001,7 +1009,7 @@ template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNP
  Packet p;
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
  else p = ploadu<Packet>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
  return vec_mergeh(p, p);
}
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
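vec_mergeh interleaves the leading elements of its two operands, so merging a packet with itself duplicates each of its first lanes, the same result the old p16uc_DUPLICATE*_HI permutes produced but without loading a mask constant; the 16- and 8-bit ploaddup variants below get the identical change. A lane-wise sketch (illustration only):

#include <array>

// vec_mergeh(p, p) on a 4-float packet: {x0,x1,x2,x3} -> {x0,x0,x1,x1},
// which is exactly the duplication ploaddup needs.
std::array<float, 4> mergeh_self_model(const std::array<float, 4>& x) {
  return { x[0], x[0], x[1], x[1] };
}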
@ -1017,7 +1025,7 @@ template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int*
  Packet8s p;
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
  else p = ploadu<Packet8s>(from);
  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
  return vec_mergeh(p, p);
}

template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
@ -1025,7 +1033,7 @@ template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned shor
  Packet8us p;
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
  else p = ploadu<Packet8us>(from);
  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
  return vec_mergeh(p, p);
}

template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
@ -1054,7 +1062,7 @@ template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char*
  Packet16c p;
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
  else p = ploadu<Packet16c>(from);
  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
  return vec_mergeh(p, p);
}

template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
@ -1062,13 +1070,15 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned ch
  Packet16uc p;
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
  else p = ploadu<Packet16uc>(from);
  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
  return vec_mergeh(p, p);
}

template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
#ifdef _BIG_ENDIAN
#if defined(__VSX__) || !defined(_BIG_ENDIAN)
  vec_xst(from, 0, to);
#else
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  Packet16uc MSQ, LSQ, edges;
@ -1083,8 +1093,6 @@ template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE_
  LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
  vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
#else
  vec_xst(from, 0, to);
#endif
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
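As with the load, the unaligned store collapses to a single vec_xst whenever VSX is available; the read-modify-write sequence from Apple's alignment note is kept only for the pre-VSX big-endian case, which is why the thread-safety warning stays. A scalar model of that fallback (illustration only):

#include <cstdint>
#include <cstring>

// Model of the pre-VSX unaligned store: both aligned quadwords straddling 'to'
// are read, the 16 payload bytes are spliced in, and both quadwords are written
// back, which is why the kernel comment flags it as not thread safe for neighbours.
void unaligned_store_model(unsigned char* to, const unsigned char value[16]) {
  const std::uintptr_t offset = reinterpret_cast<std::uintptr_t>(to) % 16;
  unsigned char* msq = to - offset;
  unsigned char quads[32];
  std::memcpy(quads, msq, 32);            // load MSQ and LSQ (the "edges")
  std::memcpy(quads + offset, value, 16); // splice the packet into place
  std::memcpy(msq, quads, 32);            // store both quadwords back
}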
@ -1164,11 +1172,19 @@ template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
}
template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
{
#ifdef _ARCH_PWR9
  return vec_revb(a);
#else
  return vec_perm(a, a, p16uc_REVERSE8);
#endif
}
template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
{
#ifdef _ARCH_PWR9
  return vec_revb(a);
#else
  return vec_perm(a, a, p16uc_REVERSE8);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
{
@ -2102,7 +2118,11 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
template<typename Packet> EIGEN_STRONG_INLINE
Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
#ifdef __POWER8_VECTOR__
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_neg(reinterpret_cast<Packet4i>(select)));
#else
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
#endif
  return vec_sel(elsePacket, thenPacket, mask);
}

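The selector lanes arrive as 0 or 1 and both branches widen them into all-zero/all-one masks for vec_sel: on POWER8, negating a lane of 1 yields 0xFFFFFFFF directly, while the fallback compares against 1. A worked scalar model of the mask construction (illustration only); the same pattern repeats in the 16- and 8-bit pblend specializations below:

#include <array>
#include <cstdint>

// Mask construction used by pblend4: select lane 1 -> 0xFFFFFFFF, 0 -> 0x00000000.
// vec_sel then takes thenPacket where the mask is all ones and elsePacket elsewhere.
std::array<std::uint32_t, 4> blend_mask_model(const std::array<std::uint32_t, 4>& select) {
  std::array<std::uint32_t, 4> mask;
  for (int i = 0; i < 4; ++i)
    mask[i] = static_cast<std::uint32_t>(-static_cast<std::int32_t>(select[i])); // same lanes as vec_cmpeq(select, 1)
  return mask;
}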
@ -2117,7 +2137,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
#ifdef __POWER8_VECTOR__
  Packet8us mask = reinterpret_cast<Packet8us>(vec_neg(reinterpret_cast<Packet8s>(select)));
#else
  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
#endif
  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
  return result;
}
@ -2125,7 +2149,11 @@ template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, cons
template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
#ifdef __POWER8_VECTOR__
  Packet8us mask = reinterpret_cast<Packet8us>(vec_neg(reinterpret_cast<Packet8s>(select)));
#else
  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
#endif
  return vec_sel(elsePacket, thenPacket, mask);
}

@ -2139,7 +2167,11 @@ template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, co
                        ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
                        ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };

#ifdef __POWER8_VECTOR__
  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_neg(reinterpret_cast<Packet16c>(select)));
#else
  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
#endif
  return vec_sel(elsePacket, thenPacket, mask);
}

@ -2149,7 +2181,11 @@ template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, c
                        ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
                        ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };

#ifdef __POWER8_VECTOR__
  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_neg(reinterpret_cast<Packet16c>(select)));
#else
  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
#endif
  return vec_sel(elsePacket, thenPacket, mask);
}

@ -2395,7 +2431,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const

template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }

template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return p2d_ZERO - a;
#endif
}

template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

@ -2487,7 +2530,7 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIG

template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
  return vec_sld(a, a, 8);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }

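vec_sld(a, a, 8) rotates the 16-byte register by half its width, which swaps the two doubles, so preverse no longer needs the p16uc_REVERSE64 permute mask or the casts around it. Lane-wise (illustration only):

#include <array>

// What vec_sld(a, a, 8) does to a two-double packet: {a0, a1} -> {a1, a0}.
std::array<double, 2> preverse2d_model(const std::array<double, 2>& a) {
  return { a[1], a[0] };
}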
@ -2692,8 +2735,8 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  Packet2d t0, t1;
  t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
  t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = t0;
  kernel.packet[1] = t1;
}
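vec_mergeh gathers the first element of each row and vec_mergel the second, which for a 2x2 PacketBlock is exactly the transpose the TRANSPOSE64 permute masks used to build. A scalar sketch (illustration only):

#include <array>

// 2x2 transpose via merge-high / merge-low:
// row0 = {a0, a1}, row1 = {b0, b1}  ->  row0 = {a0, b0}, row1 = {a1, b1}.
void transpose2x2_model(std::array<double, 2>& row0, std::array<double, 2>& row1) {
  const std::array<double, 2> t0 = { row0[0], row1[0] }; // vec_mergeh
  const std::array<double, 2> t1 = { row0[1], row1[1] }; // vec_mergel
  row0 = t0;
  row1 = t1;
}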
@ -213,6 +213,11 @@ public:
    return ploadt<PacketT, AlignmentT>(&operator()(i, j));
  }

  template<typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType &p) const {
    pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
  }

  template<typename SubPacket>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@ -311,6 +316,11 @@ public:
    return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
  }

  template<typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType &p) const {
    pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
  }

  template<typename SubPacket>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
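Both data mappers gain a storePacket so the GEMM kernels can write result packets back through the mapper rather than only loading and scattering: the strided mapper stores contiguously via pstoret, while the incremented variant scatters each lane m_incr elements apart. A scalar model of the scattered case (illustration only; the two-lane width and double element type are assumptions, not part of this diff):

#include <cstddef>

// Model of the incremented mapper's storePacket: lane k of the packet lands at
// base[k * incr], mirroring what pscatter does with m_incr.
void scatter_store_model(double* base, std::ptrdiff_t incr, const double packet[2]) {
  base[0]    = packet[0];
  base[incr] = packet[1];
}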