From 3adc78e39c625411e58a1715f9584cb1794e5dc9 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Thu, 12 Jan 2023 16:33:33 +0000 Subject: [PATCH] Altivec fixes for Darwin: do not use unsupported VSX insns (cherry picked from commit 4d05765345e7e4a984d600039f797e2fede924f3) --- Eigen/src/Core/arch/AltiVec/Complex.h | 12 +- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 19 +- .../Core/arch/AltiVec/MatrixVectorProduct.h | 2400 +++++++++++++++++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 24 +- Eigen/src/Core/util/ConfigureVectorization.h | 4 +- Eigen/src/Core/util/Macros.h | 15 +- 6 files changed, 2447 insertions(+), 27 deletions(-) create mode 100644 Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 965f4911a..c745c4742 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX #if defined(_BIG_ENDIAN) static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -100,7 +100,7 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSXs HasBlend = 1, #endif HasSetLinear = 0 @@ -130,7 +130,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex& from0, const std::complex& from1) { Packet4f res0, res1; -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); #ifdef _BIG_ENDIAN @@ -230,7 +230,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV))); } -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { Packet2cf result; result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); @@ -244,7 +244,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) } //---------- double ---------- -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} @@ -403,7 +403,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) return psqrt_complex(a); } -#endif // __VSX__ +#endif // EIGEN_VECTORIZE_VSX } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 2b7c204e3..3b0c07fac 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -40,7 +40,22 @@ Packet4f pcos(const Packet4f& _x) return pcos_float(_x); } -#ifdef __VSX__ +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) +{ + return vec_rsqrt(x); +} +#endif + +#ifdef EIGEN_VECTORIZE_VSX +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 
+Packet2d prsqrt(const Packet2d& x) +{ + return vec_rsqrt(x); +} +#endif template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) @@ -88,7 +103,7 @@ template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); } -#endif // __VSX__ +#endif // EIGEN_VECTORIZE_VSX // Hyperbolic Tangent function. template <> diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h new file mode 100644 index 000000000..bb84ac977 --- /dev/null +++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h @@ -0,0 +1,2400 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H + +#include "../../InternalHeaderCheck.h" + +#if defined(__MMA__) && !EIGEN_ALTIVEC_DISABLE_MMA +#if EIGEN_COMP_LLVM || (__GNUC__ > 10 || __GNUC_MINOR__ >= 3) +#define USE_GEMV_MMA +#endif + +#if !EIGEN_COMP_LLVM && (__GNUC__ == 10 && __GNUC_MINOR__ <= 3) +// Only allow one vector_pair in buggy gcc - gcc 10.3 has a bug +#define GCC_ONE_VECTORPAIR_BUG +#endif +#endif + +//#define USE_SLOWER_GEMV_MMA // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is improved) + +//#define EIGEN_POWER_USE_GEMV_PREFETCH +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH +#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p) +#else +#define EIGEN_POWER_GEMV_PREFETCH(p) +#endif + +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif +#endif + +#if EIGEN_COMP_LLVM +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1) +#else +#if (__GNUC__ <= 10) +#if (__GNUC_MINOR__ > 3) +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1) +#else +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2) +#endif +#else +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_build_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2) +#endif +#endif + +#define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16)) +#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float))) +#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16) +#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex))) + +/** \internal multiply and add and store results */ +template +EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data) +{ + pstoreu(res, pmadd(data, palpha, ploadu(res))); +} + +template +EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data) +{ + *res += (alpha * data); +} + +#define GEMV_UNROLL(func, N) \ + func(0, N) func(1, N) func(2, N) func(3, N) \ + func(4, N) func(5, N) func(6, N) func(7, N) + +#define GEMV_UNROLL_HALF(func, N) 
\ + func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N) + +#define GEMV_GETN(N) (((N) * ResPacketSize) >> 2) + +#define GEMV_LOADPACKET_COL(iter) \ + lhs.template load(i + ((iter) * LhsPacketSize), j) + +#ifdef USE_GEMV_MMA +#define GEMV_UNROLL3(func, N, which) \ + func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \ + func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which) + +#define GEMV_UNUSED_VAR(iter, N, which) \ + if (GEMV_GETN(N) <= iter) { \ + EIGEN_UNUSED_VARIABLE(which##iter); \ + } + +#define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \ + if (N <= iter) { \ + EIGEN_UNUSED_VARIABLE(which##iter); \ + } + +#define GEMV_UNUSED_EXTRA(N, which) \ + GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which) + +#define GEMV_UNUSED(N, which) \ + GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which) + +#define GEMV_INIT_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_xxsetaccz(&e##iter); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1)); +#else +#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \ + const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \ + b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast(&src##iter1)); +#endif + +#define GEMV_LOAD1A_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + g##iter = GEMV_LOADPACKET_COL(iter); \ + EIGEN_UNUSED_VARIABLE(b##iter); \ + } else { \ + GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \ + EIGEN_UNUSED_VARIABLE(g##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter); \ + EIGEN_UNUSED_VARIABLE(g##iter); \ + } + +#define GEMV_WORK1A_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + pger_vecMMA_acc(&e##iter, a0, g##iter); \ + } else { \ + pger_vecMMA_acc(&e##iter, b##iter, a0); \ + } \ + } + +#define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_LOADPAIR_COL_MMA(iter2, iter2) \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } else { \ + GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \ + GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } \ + EIGEN_UNUSED_VARIABLE(g##iter2); \ + EIGEN_UNUSED_VARIABLE(g##iter3); + +#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + LhsPacket h[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(h), &b##iter2); \ + pger_vecMMA_acc(&e##iter2, a0, h[0]); \ + pger_vecMMA_acc(&e##iter3, a0, h[1]); \ + } else { \ + pger_vecMMA_acc(&e##iter2, b##iter2, a0); \ + pger_vecMMA_acc(&e##iter3, b##iter3, a0); \ + } \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOAD_COL_MMA(N) \ + if (GEMV_GETN(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \ + } + +#define GEMV_WORK_COL_MMA(N) \ + if (GEMV_GETN(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \ + } +#else +#define GEMV_LOAD_COL_MMA(N) \ + GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) + +#define GEMV_WORK_COL_MMA(N) \ + GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) +#endif + +#define GEMV_DISASSEMBLE_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \ + if (!GEMV_IS_FLOAT) { \ + result##iter.packet[0][1] = result##iter.packet[1][0]; \ + result##iter.packet[2][1] = 
result##iter.packet[3][0]; \ + } \ + } + +#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \ + b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)); + +#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } else { \ + GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \ + GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \ + ResPacket f##iter2[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(f##iter2), &b##iter2); \ + f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \ + f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \ + GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]); +#else +#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \ + if (GEMV_IS_FLOAT) { \ + __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \ + } else { \ + __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \ + } +#endif + +#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \ + } else { \ + GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \ + GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \ + } \ + } + +#define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \ + *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1; + +#define GEMV_STORE_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + storeMaddData(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \ + } else { \ + GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \ + GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \ + GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \ + } \ + } + +#define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \ + } else { \ + GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \ + GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \ + } \ + } + +#define GEMV_PROCESS_COL_ONE_MMA(N) \ + GEMV_UNROLL(GEMV_INIT_MMA, N) \ + Index j = j2; \ + __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \ + do { \ + LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \ + RhsPacket a0 = pset1(rhs2(j, 0)); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_LOAD_COL_MMA(N) \ + GEMV_WORK_COL_MMA(N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \ + if (GEMV_GETN(N) <= 1) { \ + GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \ + } else { \ + GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \ + GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \ + GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \ + } \ + i += (ResPacketSize * N); +#endif + +#define GEMV_INIT(iter, N) \ + if (N > iter) { \ + c##iter = pset1(ResScalar(0)); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c##iter); \ + } + +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH +#define GEMV_PREFETCH(iter, N) \ + if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \ + lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \ + } +#else +#define GEMV_PREFETCH(iter, N) +#endif + +#define GEMV_WORK_COL(iter, N) \ 
+ if (N > iter) { \ + c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \ + } + +#define GEMV_STORE_COL(iter, N) \ + if (N > iter) { \ + pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu(res + i + (iter * ResPacketSize)))); \ + } + +/** \internal main macro for gemv_col - initialize accumulators, multiply and add inputs, and store results */ +#define GEMV_PROCESS_COL_ONE(N) \ + GEMV_UNROLL(GEMV_INIT, N) \ + Index j = j2; \ + do { \ + RhsPacket a0 = pset1(rhs2(j, 0)); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_UNROLL(GEMV_WORK_COL, N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_STORE_COL, N) \ + i += (ResPacketSize * N); + +#ifdef USE_GEMV_MMA +#define GEMV_PROCESS_COL(N) \ + GEMV_PROCESS_COL_ONE_MMA(N) +#else +#define GEMV_PROCESS_COL(N) \ + GEMV_PROCESS_COL_ONE(N) +#endif + +/** \internal perform a matrix multiply and accumulate of packet a and packet b */ +#ifdef USE_GEMV_MMA +template +EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +{ + if (accumulate) + { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } + else + { + __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +/** \internal perform a matrix multiply and accumulate of vector_pair a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b) +{ + if (accumulate) + { + __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b); + } + else + { + __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b); + } +} +#endif + +template +EIGEN_STRONG_INLINE void gemv_col( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr == 1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + RhsMapper rhs2(rhs); + + conj_helper cj; + conj_helper pcj; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + RhsPacketSize = Traits::RhsPacketSize, + }; + +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = rows - 8 * ResPacketSize + 1; + const Index n4 = rows - 4 * ResPacketSize + 1; + const Index n2 = rows - 2 * ResPacketSize + 1; +#endif + const Index n1 = rows - 1 * ResPacketSize + 1; +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH + const Index prefetch_dist = 64 * LhsPacketSize; +#endif + + // TODO: improve the following heuristic: + const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 
16 : 8); + ResPacket palpha = pset1(alpha); + + for (Index j2 = 0; j2 < cols; j2 += block_cols) + { + Index jend = numext::mini(j2 + block_cols, cols); + Index i = 0; + ResPacket c0, c1, c2, c3, c4, c5, c6, c7; +#ifdef USE_GEMV_MMA + __vector_quad e0, e1, e2, e3, e4, e5, e6, e7; + PacketBlock result0, result1, result2, result3, result4, result5, result6, result7; + GEMV_UNUSED(8, e) + GEMV_UNUSED(8, result) + GEMV_UNUSED_EXTRA(1, c) +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + while (i < n8) + { + GEMV_PROCESS_COL(8) + } + if (i < n4) + { + GEMV_PROCESS_COL(4) + } + if (i < n2) + { + GEMV_PROCESS_COL(2) + } + if (i < n1) +#else + while (i < n1) +#endif + { + GEMV_PROCESS_COL_ONE(1) + } + for (;i < rows;++i) + { + ResScalar d0(0); + Index j = j2; + do { + d0 += cj.pmul(lhs(i, j), rhs2(j, 0)); + } while (++j < jend); + res[i] += alpha * d0; + } + } +} + +const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb }; +const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 }; + +#ifdef _BIG_ENDIAN +const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +#else +const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +#endif + +#ifdef _BIG_ENDIAN +#define COMPLEX_DELTA 0 +#else +#define COMPLEX_DELTA 2 +#endif + +/** \internal packet conjugate (same as pconj but uses the constants in pcplxflipconj for better code generation) */ +EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) { + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_CONJ_XOR))); +} + +EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) { + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_CONJ_XOR))); +} + +/** \internal packet conjugate with real & imaginary operation inverted */ +EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) { +#ifdef __POWER8_VECTOR__ + return Packet2cf(Packet4f(vec_neg(Packet2d(a.v)))); +#else + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_CONJ_XOR2))); +#endif +} + 
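Aside: the CONJ_XOR mask constants above work because an IEEE-754 sign flip is a plain bit flip, so pconj2 / pconjinv reduce to a single vector XOR against a mask that targets the imaginary (or real) lanes. A minimal scalar sketch of the same idea, standalone C++ (conj_via_xor is a hypothetical helper for illustration, not Eigen API):

#include <cassert>
#include <complex>
#include <cstdint>
#include <cstring>

// Conjugate a complex float by XOR-ing the sign bit of its imaginary part,
// mirroring what p16uc_COMPLEX32_CONJ_XOR does lane-wise on a 128-bit vector.
static std::complex<float> conj_via_xor(std::complex<float> a) {
  float im = a.imag();
  std::uint32_t bits;
  std::memcpy(&bits, &im, sizeof bits);
  bits ^= 0x80000000u;  // flip the sign bit
  std::memcpy(&im, &bits, sizeof im);
  return std::complex<float>(a.real(), im);
}

int main() {
  std::complex<float> a(1.5f, -2.25f);
  assert(conj_via_xor(a) == std::conj(a));
  return 0;
}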
+EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) { + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_CONJ_XOR2))); +} + +#if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12) +#define PERMXOR_GOOD // Clang had a bug with vec_permxor and endianness prior to version 12 +#endif + +/** \internal flip the real & imaginary results and packet conjugate */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP))); +#else + return pcplxflip(pconj2(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP))); +#else + return pcplxflip(pconj2(a)); +#endif +} + +/** \internal packet conjugate and flip the real & imaginary results */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP))); +#else + return pconj2(pcplxflip(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP))); +#else + return pconj2(pcplxflip(a)); +#endif +} + +/** \internal packet negate */ +EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a) +{ +#ifdef __POWER8_VECTOR__ + return Packet2cf(vec_neg(a.v)); +#else + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_NEGATE))); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a) +{ +#ifdef __POWER8_VECTOR__ + return Packet1cd(vec_neg(a.v)); +#else + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_NEGATE))); +#endif +} + +/** \internal flip the real & imaginary results and negate */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP))); +#else + return pcplxflip(pnegate2(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP))); +#else + return pcplxflip(pnegate2(a)); +#endif +} + +/** \internal flip the real & imaginary results */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a) +{ + return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP))); +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a) +{ +#ifdef EIGEN_VECTORIZE_VSX + return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2)); +#else + return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP))); +#endif +} + +/** \internal load half a vector with one complex value */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex* src) +{ + Packet4f t; +#ifdef EIGEN_VECTORIZE_VSX + // Load float64/two float32 (doubleword alignment) + __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src)); +#else + *reinterpret_cast*>(reinterpret_cast(&t) + COMPLEX_DELTA) = *src; +#endif + return t; +} + +/** \internal load two vectors from the real and imaginary portions of a complex value */ +template +EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i) +{ +#ifdef _ARCH_PWR9 + __asm__("lxvwsx %x0,%y1" : "=wa" (r) : "Z" 
(*(reinterpret_cast(src) + 0))); + __asm__("lxvwsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast(src) + 1))); +#else + Packet4f t = pload_complex_half(src); + r = vec_splat(t, COMPLEX_DELTA + 0); + i = vec_splat(t, COMPLEX_DELTA + 1); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i) +{ +#ifdef EIGEN_VECTORIZE_VSX + __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast(src) + 0))); + __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast(src) + 1))); +#else + Packet2d t = ploadu(reinterpret_cast(src)); + r = vec_splat(t, 0); + i = vec_splat(t, 1); +#endif +} + +#ifndef __POWER8_VECTOR__ +const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B }; + +const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +#endif + +/** \internal load two vectors from the interleaved real & imaginary values of src */ +template +EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i) +{ + Packet4f t = ploadu(reinterpret_cast(src)); +#ifdef __POWER8_VECTOR__ + r = vec_mergee(t, t); + i = vec_mergeo(t, t); +#else + r = vec_perm(t, t, p16uc_MERGEE); + i = vec_perm(t, t, p16uc_MERGEO); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i) +{ + return pload_realimag(src, r, i); +} + +/** \internal load and splat a complex value into a vector - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex* src) +{ +#ifdef EIGEN_VECTORIZE_VSX + Packet4f ret; + __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast(src) + 0))); + return ret; +#else + return Packet4f(ploaddup(reinterpret_cast(src))); +#endif +} + +EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a complex value into a vector - row-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex* src) +{ + return ploadu(src).v; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a scalar or a vector from complex location */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex* src) +{ + if (GEMV_IS_SCALAR) { + return pload_complex_half(src); + } + else + { + return ploadu(reinterpret_cast(src)); + } +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex* src) +{ + return ploadu(reinterpret_cast(src)); +} + +/** \internal load from a complex vector and convert to a real vector */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src) +{ + return src->v; +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src) +{ + return src->v; +} + +/** \internal load a full vector from complex location - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex* src) +{ + return Packet4f(ploaddup(reinterpret_cast(src))); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a full vector from complex location - row-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex* src) +{ + return ploadu(src).v; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex* src) +{ + return pload_complex_full(src); +} + +/** \internal load a vector from a real-only scalar location - column-wise */ 
+EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src) +{ + return pset1(*src); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src) +{ + return pset1(*src); +} + +EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src) +{ + return src; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src) +{ + return src; +} + +/** \internal load a vector from a real-only vector location */ +EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src) +{ + Packet4f ret = ploadu(src); + return vec_mergeh(ret, ret); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src) +{ + return pload_real(src); +} + +EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex* src) +{ + return pload_complex_full(src); // Just for compilation +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex* src) +{ + return pload_complex_full(src); // Just for compilation +} + +/** \internal load a vector from a real-only scalar location - row-wise */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src) +{ + if (GEMV_IS_SCALAR) { + return pload_real_full(src); + } + else { + return ploadu(src); + } +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src) +{ + return pload_real(src); +} + +EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex& b) +{ + EIGEN_UNUSED_VARIABLE(b); + return a; // Just for compilation +} + +EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex& b) +{ + EIGEN_UNUSED_VARIABLE(b); + return a; // Just for compilation +} + +/** \internal set a scalar from complex location */ +template +EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj) +{ + return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag()); +} + +/** \internal set a vector from complex location */ +template +EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex& alpha) +{ + Packet2cf ret; + ret.v[COMPLEX_DELTA + 0] = pset1_realimag(alpha, (which & 0x01), (which & 0x04)); + ret.v[COMPLEX_DELTA + 1] = pset1_realimag(alpha, (which & 0x02), (which & 0x08)); + ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0]; + ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1]; + return ret; +} + +template +EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex& alpha) +{ + Packet1cd ret; + ret.v[0] = pset1_realimag(alpha, (which & 0x01), (which & 0x04)); + ret.v[1] = pset1_realimag(alpha, (which & 0x02), (which & 0x08)); + return ret; +} + +/** \internal zero out a vector for real or complex forms */ +template +EIGEN_ALWAYS_INLINE Packet pset_zero() +{ + return pset1(__UNPACK_TYPE__(Packet)(0)); +} + +template<> +EIGEN_ALWAYS_INLINE Packet2cf pset_zero() +{ + return Packet2cf(pset1(float(0))); +} + +template<> +EIGEN_ALWAYS_INLINE Packet1cd pset_zero() +{ + return Packet1cd(pset1(double(0))); +} + +/** \internal initialize a vector from another vector */ +template +EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1) +{ + if (GEMV_IS_COMPLEX_COMPLEX) { + EIGEN_UNUSED_VARIABLE(c1); + return pset_zero(); + } + else + { + return c1; // Intentionally left uninitialized + } +} + +template +struct alpha_store +{ + alpha_store(ResScalar& alpha) { + separate.r = pset1_complex(alpha); + separate.i = pset1_complex(alpha); + } + struct ri { + PResPacket r; + PResPacket i; + } separate; +}; + +/** \internal multiply and add for complex math */ +template +EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0) +{ + return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4)); +} 
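For reference, alpha_store splits alpha into lane-splatted real and imaginary parts so that pmadd_complex can evaluate res += alpha * c with two fused multiply-adds on interleaved [re, im] data, the second operand being the flip-and-conjugate packet (pcplxflipconj), which corresponds to i*c. A scalar sketch of that identity, standalone C++ (madd_split is a hypothetical helper for illustration, not Eigen API):

#include <cassert>
#include <complex>

// res += alpha*c, decomposed as alpha.real()*c + alpha.imag()*(i*c),
// where i*c = (-c.imag(), c.real()) is what pcplxflipconj yields on an
// interleaved [re, im] packet in the non-conjugated case.
static std::complex<float> madd_split(std::complex<float> alpha,
                                      std::complex<float> c,
                                      std::complex<float> res) {
  std::complex<float> flipped(-c.imag(), c.real());  // i * c
  return res + alpha.real() * c + alpha.imag() * flipped;
}

int main() {
  std::complex<float> alpha(0.5f, -2.0f), c(3.0f, 4.0f), res(1.0f, 1.0f);
  assert(madd_split(alpha, c, res) == res + alpha * c);
  return 0;
}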
+ +/** \internal store and madd for complex math */ +template +EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res) +{ + PResPacket c2 = pcplxflipconj(c0); + if (GEMV_IS_SCALAR) { + ScalarPacket c4 = ploadu(reinterpret_cast(res)); + ScalarPacket c3 = pmadd_complex(c0.v, c2.v, c4, b0); + pstoreu(reinterpret_cast(res), c3); + } else { + ScalarPacket c4 = pload_complex(res); + PResPacket c3 = PResPacket(pmadd_complex(c0.v, c2.v, c4, b0)); + pstoreu(res, c3); + } +} + +template +EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) +{ + PResPacket c2 = pcplxflipconj(c0); + PResPacket c3 = pcplxflipconj(c1); +#if !defined(_ARCH_PWR10) + ScalarPacket c4 = pload_complex(res + (iter2 * ResPacketSize)); + ScalarPacket c5 = pload_complex(res + ((iter2 + 1) * ResPacketSize)); + PResPacket c6 = PResPacket(pmadd_complex(c0.v, c2.v, c4, b0)); + PResPacket c7 = PResPacket(pmadd_complex(c1.v, c3.v, c5, b0)); + pstoreu(res + (iter2 * ResPacketSize), c6); + pstoreu(res + ((iter2 + 1) * ResPacketSize), c7); +#else + __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)); +#if EIGEN_COMP_LLVM + PResPacket c6[2]; + __builtin_vsx_disassemble_pair(reinterpret_cast(c6), &a); + c6[0] = PResPacket(pmadd_complex(c0.v, c2.v, c6[0].v, b0)); + c6[1] = PResPacket(pmadd_complex(c1.v, c3.v, c6[1].v, b0)); + GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v); +#else + if (GEMV_IS_COMPLEX_FLOAT) { + __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v)); + __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v)); + } else { + __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v)); + __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v)); + } +#endif + *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a; +#endif +} + +/** \internal load lhs packet */ +template +EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) +{ + if (sizeof(Scalar) == sizeof(LhsScalar)) { + const LhsScalar& src = lhs(i + 0, j); + return LhsPacket(pload_real_full(const_cast(&src))); + } + return lhs.template load(i + 0, j); +} + +/** \internal madd for complex times complex */ +template +EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c) +{ + if (ConjugateLhs && ConjugateRhs) { + return vec_madd(a, pconj2(ComplexPacket(b)).v, c); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return vec_nmsub(a, b, c); + } + else { + return vec_madd(a, b, c); + } +} + +/** \internal madd for complex times real */ +template +EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c) +{ + if (Conjugate) { + return vec_madd(a, pconj2(ComplexPacket(b)).v, c); + } + else { + return vec_madd(a, b, c); + } +} + +template +EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + conj_helper pcj; + RhsPacket b0; + if (StorageOrder == ColMajor) { + b0 = pset1(*b); + } + else { + b0 = ploadu(b); + } + c0 = pcj.pmadd(a0, b0, c0); +} + +/** \internal core multiply operation for vectors - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1) +{ + 
ScalarPacket br, bi; + if (StorageOrder == ColMajor) { + pload_realimag(b, br, bi); + } + else { + pload_realimag_row(b, br, bi); + } + if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0); + LhsPacket a1 = pcplxflipconj(a0); + ScalarPacket cr = pmadd_complex_complex(a0.v, br, c0.v); + ScalarPacket ci = pmadd_complex_complex(a1.v, bi, c1.v); + c1 = ResPacket(ci); + c0 = PResPacket(cr); +} + +/** \internal core multiply operation for vectors - real times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_complex_full(b); + } + else { + b0 = pload_complex_full_row(b); + } + ScalarPacket cri = pmadd_complex_real(a0, b0, c0.v); + c0 = PResPacket(cri); +} + +/** \internal core multiply operation for vectors - complex times real */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + ScalarPacket a1 = pload_complex(&a0); + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_real(b); + } + else { + b0 = pload_real_row(b); + } + ScalarPacket cri = pmadd_complex_real(a1, b0, c0.v); + c0 = PResPacket(cri); +} + +#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \ +{ \ + gemv_mult_complex_complex(a0, b, c0, c1); \ +} + +GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex, Packet2cf) +GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex, Packet1cd) + +#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \ +{ \ + gemv_mult_real_complex(a0, b, c0); \ +} + +GEMV_MULT_REAL_COMPLEX(float, std::complex, Packet2cf) +GEMV_MULT_REAL_COMPLEX(double, std::complex, Packet1cd) +GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex, Packet2cf) +GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex, Packet1cd) + +#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \ +{ \ + gemv_mult_complex_real(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex) +GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex) +GEMV_MULT_COMPLEX_REAL(std::complex, float, Packet2cf, std::complex) +GEMV_MULT_COMPLEX_REAL(std::complex, double, Packet1cd, std::complex) + +#ifdef USE_GEMV_MMA +/** \internal convert packet to real form */ +template +EIGEN_ALWAYS_INLINE T convertReal(T a) +{ + return a; +} + +EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a) +{ + return a.v; +} + +EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a) +{ + return a.v; +} + +/** \internal convert packet to complex form */ +template +EIGEN_ALWAYS_INLINE T convertComplex(T a) +{ + return a; +} + +EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a) +{ + return Packet2cf(a); +} + +EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a) +{ + return Packet1cd(a); +} + +/** \internal load a vector from a complex location (for MMA version) */ +template +EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a) +{ + a = SLhsPacket(pload_complex(&a)); +} + +template +EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&) +{ + // Pass thru +} + +/** \internal perform a matrix multiply and accumulate (positive and negative) of packet a and packet b */ +template +EIGEN_ALWAYS_INLINE void 
pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b) +{ + if (NegativeAccumulate) + { + __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } + else { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +/** \internal perform a matrix multiply and accumulate (positive and negative) of vector_pair a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b) +{ + if (NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b); + } + else { + __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&) +{ + // Just for compilation +} + +/** \internal madd for complex times complex (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) +{ + if (ConjugateLhs && ConjugateRhs) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, b2, a.v); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return pger_vecMMA(c, b, a.v); + } + else { + return pger_vecMMA(c, b, a.v); + } +} + +template +EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) +{ + if (ConjugateLhs && ConjugateRhs) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, a, b2); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return pger_vecMMA(c, a, b); + } + else { + return pger_vecMMA(c, a, b); + } +} + +/** \internal madd for complex times real (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) +{ + RealPacket a2 = convertReal(a); + if (Conjugate) { + RealPacket b2 = pconj2(convertComplex(b)).v; + if (StorageOrder == ColMajor) { + return pger_vecMMA(c, b2, a2); + } else { + return pger_vecMMA(c, a2, b2); + } + } + else { + if (StorageOrder == ColMajor) { + return pger_vecMMA(c, b, a2); + } else { + return pger_vecMMA(c, a2, b); + } + } +} + +/** \internal madd for real times complex (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) +{ + if (Conjugate) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, a, b2); + } + else { + return pger_vecMMA(c, a, b); + } +} + +/** \internal core multiply operation for vectors (MMA version) - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_realimag_combine(b); + } else { + b0 = pload_realimag_combine_row(b); + } + pmadd_complex_complex_MMA(a0, b0, c0); +} + +/** \internal core multiply operation for vectors (MMA version) - complex times real */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + pload_complex_MMA(a0); + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_real(b); + } + else { + b0 = pload_real_row(b); + } + pmadd_complex_real_MMA(a0, b0, c0); +} + +/** \internal core multiply operation for vectors (MMA version) - real times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = 
pload_complex_full(b); + } + else { + b0 = pload_complex_full_row(b); + } + pmadd_complex_real_MMA)) ? StorageOrder : ColMajor>(a0, b0, c0); +} + +#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_complex_complex_MMA(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex) +GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex) +GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex) + +/** \internal core multiply operation for vectors (MMA version) - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex* b, __vector_quad* c0) +{ + if (sizeof(LhsScalar) == 16) { + gemv_mult_complex_complex_MMA(a0, b, c0); + } + else { + gemv_mult_real_complex_MMA(a0, b, c0); + } +} + +#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_real_complex_MMA(a0, b, c0); \ +} + +GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex) +GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex) + +#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_complex_real_MMA(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float) +GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double) +GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float) +GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double) + +/** \internal disassemble MMA accumulator results into packets */ +template +EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock& result0) +{ + __builtin_mma_disassemble_acc(&result0.packet, c0); + if (sizeof(LhsPacket) == 16) { + if (sizeof(RhsPacket) == 16) { + ScalarPacket tmp0, tmp2; + tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]); + tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]); + result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]); + result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]); + result0.packet[2] = tmp2; + result0.packet[0] = tmp0; + + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; + } else if (ConjugateRhs) { + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; + } else { + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]); + } else { + result0.packet[0][1] = result0.packet[1][1]; + result0.packet[2][1] = result0.packet[3][1]; + } + } +} + +template +EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock& result0) +{ + __builtin_mma_disassemble_acc(&result0.packet, c0); + if (GEMV_IS_COMPLEX_COMPLEX) { + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; + } else { + if (ConjugateRhs) { + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; + } else { + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; + } + } + result0.packet[0] = 
vec_add(result0.packet[0], result0.packet[1]); + } else if (sizeof(LhsPacket) == sizeof(std::complex)) { + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + } + } else { + result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]); + } +} + +template +EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock& result0) +{ + if (!GEMV_IS_COMPLEX_FLOAT) { + disassembleResults2(c0, result0); + } else { + disassembleResults4(c0, result0); + } +} +#endif + +#define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1) + +#define GEMV_LOADPACKET_COL_COMPLEX(iter) \ + loadLhsPacket(lhs, i + ((iter) * ResPacketSize), j) + +#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \ + convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter)) + +#ifdef USE_GEMV_MMA +#define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + __builtin_mma_xxsetaccz(&e0##iter); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \ + EIGEN_UNUSED_VARIABLE(f##iter1); +#else +#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \ + if (sizeof(LhsPacket) == 16) { \ + const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \ + a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast(&src)); \ + EIGEN_UNUSED_VARIABLE(f##iter1); \ + } else { \ + f##iter1 = lhs.template load(i + ((iter2) * ResPacketSize), j); \ + GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \ + } +#endif + +#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \ + EIGEN_UNUSED_VARIABLE(a##iter); \ + } else { \ + GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(a##iter); \ + EIGEN_UNUSED_VARIABLE(f##iter); \ + } + +#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + gemv_mult_complex_MMA(f##iter, b, &e0##iter); \ + } else { \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } \ + } + +#define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); + +#define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \ + EIGEN_UNUSED_VARIABLE(a##iter3) \ + } else { \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(a##iter2); \ + EIGEN_UNUSED_VARIABLE(a##iter3); \ + } \ + EIGEN_UNUSED_VARIABLE(f##iter2); \ + EIGEN_UNUSED_VARIABLE(f##iter3); + +#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + PLhsPacket g[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(g), &a##iter2); \ + gemv_mult_complex_MMA(g[0], b, &e0##iter2); \ + gemv_mult_complex_MMA(g[1], b, &e0##iter3); \ + } else { \ + gemv_mult_complex_MMA(a##iter2, b, &e0##iter2); \ + gemv_mult_complex_MMA(a##iter3, b, &e0##iter3); \ + } \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOAD_COL_COMPLEX_MMA(N) \ + if (GEMV_GETN_COMPLEX(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 
1)) \ + } else { \ + GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \ + } + +#define GEMV_WORK_COL_COMPLEX_MMA(N) \ + if (GEMV_GETN_COMPLEX(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \ + } +#else +#define GEMV_LOAD_COL_COMPLEX_MMA(N) \ + GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) + +#define GEMV_WORK_COL_COMPLEX_MMA(N) \ + GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) +#endif + +#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \ + disassembleResults(&e0##iter, result0##iter); + +#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \ + c0##iter = PResPacket(result0##iter.packet[0]); \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \ + } else { \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \ + c0##iter = PResPacket(result0##iter.packet[2]); \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \ + } \ + } + +#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \ + c0##iter2 = PResPacket(result0##iter2.packet[0]); \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + c0##iter3 = PResPacket(result0##iter3.packet[0]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + } else { \ + c0##iter3 = PResPacket(result0##iter2.packet[2]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + c0##iter2 = PResPacket(result0##iter3.packet[0]); \ + c0##iter3 = PResPacket(result0##iter3.packet[2]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + } \ + } + +#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \ + GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \ + Index j = j2; \ + do { \ + const RhsScalar& b1 = rhs2(j, 0); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_LOAD_COL_COMPLEX_MMA(N) \ + GEMV_WORK_COL_COMPLEX_MMA(N) \ + } while (++j < jend); \ + if (GEMV_GETN(N) <= 2) { \ + GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \ + } else { \ + GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \ + } \ + i += (ResPacketSize * N); +#endif + +#define GEMV_INIT_COMPLEX(iter, N) \ + if (N > iter) { \ + c0##iter = pset_zero(); \ + c1##iter = pset_init(c1##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c0##iter); \ + EIGEN_UNUSED_VARIABLE(c1##iter); \ + } + +#define GEMV_WORK_COL_COMPLEX(iter, N) \ + if (N > iter) { \ + f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \ + gemv_mult_complex(f##iter, b, c0##iter, c1##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(f##iter); \ + } + +#define GEMV_STORE_COL_COMPLEX(iter, N) \ + if (N > iter) { \ + if (GEMV_IS_COMPLEX_COMPLEX) { \ + c0##iter = padd(c0##iter, c1##iter); \ + } \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \ + } + +/** \internal main macro for gemv_complex_col - initialize accumulators, multiply and add inputs, and store results */ +#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \ + GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \ + Index j = j2; \ + do { \ + const RhsScalar& b1 = rhs2(j, 0); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \ + i += (ResPacketSize * N); + +#if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || 
defined(USE_SLOWER_GEMV_MMA)) +#define USE_GEMV_COL_COMPLEX_MMA +#endif + +#ifdef USE_GEMV_COL_COMPLEX_MMA +#define GEMV_PROCESS_COL_COMPLEX(N) \ + GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) +#else +#if defined(USE_GEMV_MMA) && (__GNUC__ > 10) +#define GEMV_PROCESS_COL_COMPLEX(N) \ + if (sizeof(Scalar) != sizeof(LhsPacket)) { \ + GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \ + } else { \ + GEMV_PROCESS_COL_COMPLEX_ONE(N) \ + } +#else +#define GEMV_PROCESS_COL_COMPLEX(N) \ + GEMV_PROCESS_COL_COMPLEX_ONE(N) +#endif +#endif + +template +EIGEN_STRONG_INLINE void gemv_complex_col( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename packet_traits::type ScalarPacket; + typedef typename packet_traits::type PLhsPacket; + typedef typename packet_traits::type PResPacket; + typedef gemv_traits PTraits; + + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr == 1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + RhsMapper rhs2(rhs); + + conj_helper cj; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = PTraits::ResPacketSize, + LhsPacketSize = PTraits::LhsPacketSize, + RhsPacketSize = PTraits::RhsPacketSize, + }; +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH + const Index prefetch_dist = 64 * LhsPacketSize; +#endif + +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = rows - 8 * ResPacketSize + 1; + const Index n4 = rows - 4 * ResPacketSize + 1; + const Index n2 = rows - 2 * ResPacketSize + 1; +#endif + const Index n1 = rows - 1 * ResPacketSize + 1; + + // TODO: improve the following heuristic: + const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 
16 : 8); + + typedef alpha_store AlphaData; + AlphaData alpha_data(alpha); + + for (Index j2 = 0; j2 < cols; j2 += block_cols) + { + Index jend = numext::mini(j2 + block_cols, cols); + Index i = 0; + PResPacket c00, c01, c02, c03, c04, c05, c06, c07; + ResPacket c10, c11, c12, c13, c14, c15, c16, c17; + PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7; +#ifdef USE_GEMV_MMA + __vector_quad e00, e01, e02, e03, e04, e05, e06, e07; + __vector_pair a0, a1, a2, a3, a4, a5, a6, a7; + PacketBlock result00, result01, result02, result03, result04, result05, result06, result07; + GEMV_UNUSED(8, e0) + GEMV_UNUSED(8, result0) + GEMV_UNUSED(8, a) + GEMV_UNUSED(8, f) +#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA) + if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT) +#endif +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + { + while (i < n8) + { + GEMV_PROCESS_COL_COMPLEX(8) + } + } + while (i < n4) + { + GEMV_PROCESS_COL_COMPLEX(4) + } + if (i < n2) + { + GEMV_PROCESS_COL_COMPLEX(2) + } + if (i < n1) +#else + while (i < n1) +#endif + { + GEMV_PROCESS_COL_COMPLEX_ONE(1) + } + for (;i < rows;++i) + { + ResScalar d0(0); + Index j = j2; + do { + d0 += cj.pmul(lhs(i, j), rhs2(j, 0)); + } while (++j < jend); + res[i] += alpha * d0; + } + } +} + +template struct ScalarBlock { + Scalar scalar[N]; +}; + +#ifdef USE_GEMV_MMA +static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f }; + +/** \internal predux (add elements of a vector) from a MMA accumulator - real results */ +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]); + result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]); + result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]); + result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3); + result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +template<> +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +/** \internal add complex results together */ +template +EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(PacketBlock& result0, PacketBlock& result1) +{ + ScalarBlock, 2> cc0; + result0.packet[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(result0.packet[0]), reinterpret_cast(result1.packet[0]))); + result0.packet[2] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[2]), reinterpret_cast(result1.packet[2]))); + result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]); + if (GEMV_IS_COMPLEX_COMPLEX) { + result0.packet[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(result0.packet[1]), reinterpret_cast(result1.packet[1]))); + result0.packet[3] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[3]), reinterpret_cast(result1.packet[3]))); + result0.packet[1] = vec_add(result0.packet[1], 
result0.packet[3]); + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; + } else if (ConjugateRhs) { + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; + } else { + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + } else { + if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex))) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + } + } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[0][2]); + cc0.scalar[1].imag(result0.packet[0][3]); + return cc0; +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(PacketBlock&, PacketBlock&) +{ + ScalarBlock, 2> cc0; + EIGEN_UNUSED_VARIABLE(cc0); + return cc0; // Just for compilation +} + +/** \internal predux (add elements of a vector) from a MMA accumulator - complex results */ +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + return addComplexResults(result0, result1); +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0) +{ + PacketBlock result0; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0) +{ + ScalarBlock cc0; + PacketBlock result0; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + if (GEMV_IS_COMPLEX_COMPLEX) { + if (ConjugateLhs) { + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; + } else if (ConjugateRhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; + } else { + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; + } + result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2)); + result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2)); + } else { + result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1); + result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1); + } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[2][0]); + cc0.scalar[1].imag(result0.packet[2][1]); + return cc0; +} +#endif + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(ResPacket& a, ResPacket& b) +{ + ScalarBlock cc0; + cc0.scalar[0] = predux(a); + cc0.scalar[1] = predux(b); + return cc0; +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(ResPacket& a, ResPacket& b) +{ + return predux_real(a, b); +} + +#define GEMV_UNROLL_ROW(func, N) \ + func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N) + +#define GEMV_UNROLL_ROW_HALF(func, N) \ + func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) 
func(3, 6, 7, N) + +#define GEMV_LOADPACKET_ROW(iter) \ + lhs.template load(i + (iter), j) + +#ifdef USE_GEMV_MMA +#define GEMV_UNROLL3_ROW(func, N, which) \ + func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \ + func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which) + +#define GEMV_UNUSED_ROW(N, which) \ + GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which) + +#define GEMV_INIT_ROW(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_xxsetaccz(&c##iter); \ + } + +#define GEMV_LOADPAIR_ROW(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1)); + +#define GEMV_WORK_ROW(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + pger_vecMMA_acc(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \ + } else { \ + __vector_pair b##iter; \ + GEMV_LOADPAIR_ROW(iter, iter << 1) \ + pger_vecMMA_acc(&c##iter, b##iter, a0); \ + } \ + } + +#define GEMV_PREDUX2(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + if (GEMV_IS_FLOAT) { \ + cc##iter1 = predux_real(&c##iter2, &c##iter3); \ + } else { \ + cc##iter1 = predux_real(&c##iter1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } +#else +#define GEMV_INIT_ROW(iter, N) \ + if (N > iter) { \ + c##iter = pset1(ResScalar(0)); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c##iter); \ + } + +#define GEMV_WORK_ROW(iter, N) \ + if (N > iter) { \ + c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \ + } + +#define GEMV_PREDUX2(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1 = predux_real(c##iter2, c##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } +#endif + +#define GEMV_MULT(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \ + cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \ + } + +#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + storeMaddData(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \ + storeMaddData(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \ + } + +/** \internal main macro for gemv_row - initialize accumulators, multiply and add inputs, predux and store results */ +#define GEMV_PROCESS_ROW(N) \ + for (; i < n##N; i += N) { \ + GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \ + Index j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + RhsPacket a0 = rhs2.template load(j); \ + GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \ + for (; j < cols; ++j) { \ + RhsScalar a0 = rhs2(j); \ + GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \ + } + +template +EIGEN_STRONG_INLINE void gemv_row( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0); + + eigen_internal_assert(rhs.stride() == 1); + conj_helper cj; + conj_helper pcj; + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. 
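+  //       (Interpretation, not a measured tuning: the 32000-byte threshold used
+  //       below is on the order of a 32 KB L1 data cache, so once a single lhs
+  //       stride exceeds it, eight concurrent row streams are unlikely to stay
+  //       cache-resident.)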
+#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7); + const Index n4 = rows - 3; + const Index n2 = rows - 1; +#endif + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + RhsPacketSize = Traits::RhsPacketSize, + }; + + Index i = 0; +#ifdef USE_GEMV_MMA + __vector_quad c0, c1, c2, c3, c4, c5, c6, c7; + GEMV_UNUSED_ROW(8, c) +#else + ResPacket c0, c1, c2, c3, c4, c5, c6, c7; +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + ScalarBlock cc0, cc1, cc2, cc3; + GEMV_PROCESS_ROW(8) + GEMV_PROCESS_ROW(4) + GEMV_PROCESS_ROW(2) +#endif + for (; i < rows; ++i) + { + ResPacket d0 = pset1(ResScalar(0)); + Index j = 0; + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) + { + RhsPacket b0 = rhs2.template load(j); + + d0 = pcj.pmadd(lhs.template load(i + 0, j), b0, d0); + } + ResScalar dd0 = predux(d0); + for (; j < cols; ++j) + { + dd0 += cj.pmul(lhs(i, j), rhs2(j)); + } + res[i * resIncr] += alpha * dd0; + } +} + +#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(float) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double) + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1) +{ + if (GEMV_IS_COMPLEX_COMPLEX) { + a0 = padd(a0, a1); + b0 = padd(b0, b1); + } + return predux_complex(a0, b0); +} + +#define GEMV_LOADPACKET_ROW_COMPLEX(iter) \ + loadLhsPacket(lhs, i + (iter), j) + +#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \ + convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter)) + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \ + j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + const RhsScalar& b1 = rhs2(j); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL_ROW(which, N) \ + } + +#define GEMV_PROCESS_END_ROW_COMPLEX(N) \ + for (; j < cols; ++j) { \ + RhsScalar b0 = rhs2(j); \ + GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1)) + +#ifdef USE_GEMV_MMA +#define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + __builtin_mma_xxsetaccz(&e0##iter); \ + } + +#define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1)); + +#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + PLhsPacket a##iter = 
GEMV_LOADPACKET_ROW_COMPLEX(iter); \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } else { \ + __vector_pair a##iter; \ + GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } \ + } + +#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + cc##iter1 = predux_complex(&e0##iter2, &e0##iter3); \ + } else { \ + cc##iter1 = predux_complex(&e0##iter1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N) + +#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N); \ + } +#endif + +#define GEMV_WORK_ROW_COMPLEX(iter, N) \ + if (N > iter) { \ + PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \ + gemv_mult_complex(a##iter, b, c0##iter, c1##iter); \ + } + +#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1 = predux_complex(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \ + cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \ + } + +#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + storeMaddData(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \ + storeMaddData(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N) + +/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store results */ +#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \ + if (GEMV_IS_COMPLEX_COMPLEX) { \ + c0##iter = padd(c0##iter, c1##iter); \ + } \ + dd0 = predux(c0##iter); + +#if EIGEN_COMP_LLVM +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) + +#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) +#else +// gcc seems to be reading and writing registers unnecessarily to memory. +// Use the old way for complex double until it is fixed. 
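+//
+// Reference semantics (a scalar sketch of what one row of either code path
+// computes; the names follow the surrounding kernel and conjugation is applied
+// by the cj helper according to ConjugateLhs/ConjugateRhs):
+//
+//   for (Index r = 0; r < rows; ++r) {
+//     ResScalar acc(0);
+//     for (Index k = 0; k < cols; ++k)
+//       acc += cj.pmul(lhs(r, k), rhs2(k));
+//     res[r * resIncr] += alpha * acc;
+//   }
+//
+// The _OLD macros below realize this with plain conj_helper::pmadd on whole
+// complex packets, which sidesteps the extra register-to-memory traffic gcc
+// currently generates for the newer two-accumulator path.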
+ +#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \ + lhs.template load(i + (iter), j) + +#define GEMV_INIT_COMPLEX_OLD(iter, N) \ + EIGEN_UNUSED_VARIABLE(c0##iter); \ + if (N > iter) { \ + c1##iter = pset_zero(); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c1##iter); \ + } + +#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \ + if (N > iter) { \ + LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \ + c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \ + } + +#define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] = predux(c1##iter2); \ + cc##iter1.scalar[1] = predux(c1##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \ + j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + RhsPacket b0 = rhs2.template load(j); \ + GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \ + dd0 = predux(c1##iter); + +#if (__GNUC__ > 10) +#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1 +#else +#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW \ + (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX +#endif + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \ + } +#endif + +#ifdef USE_GEMV_MMA +#define GEMV_PROCESS_ROW_COMPLEX(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) +#else +#define GEMV_PROCESS_ROW_COMPLEX(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE(N) +#endif + +template +EIGEN_STRONG_INLINE void gemv_complex_row( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename packet_traits::type ScalarPacket; + typedef typename packet_traits::type PLhsPacket; + typedef typename packet_traits::type PResPacket; + typedef gemv_traits PTraits; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0); + + eigen_internal_assert(rhs.stride() == 1); + conj_helper cj; +#if !EIGEN_COMP_LLVM + conj_helper pcj; +#endif + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 
(rows - 7) : (rows - 7); + const Index n4 = rows - 3; + const Index n2 = rows - 1; +#endif + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = PTraits::ResPacketSize, + LhsPacketSize = PTraits::LhsPacketSize, + RhsPacketSize = PTraits::RhsPacketSize, + }; + + Index i = 0, j; + PResPacket c00, c01, c02, c03, c04, c05, c06, c07; + ResPacket c10, c11, c12, c13, c14, c15, c16, c17; +#ifdef USE_GEMV_MMA + __vector_quad e00, e01, e02, e03, e04, e05, e06, e07; + GEMV_UNUSED_ROW(8, e0) + GEMV_UNUSED_EXTRA(1, c0) + GEMV_UNUSED_EXTRA(1, c1) +#endif + ResScalar dd0; +#ifndef GCC_ONE_VECTORPAIR_BUG + ScalarBlock cc0, cc1, cc2, cc3; +#ifdef USE_GEMV_MMA + if (!GEMV_IS_COMPLEX_COMPLEX) +#endif + { + GEMV_PROCESS_ROW_COMPLEX(8) + } + GEMV_PROCESS_ROW_COMPLEX(4) + GEMV_PROCESS_ROW_COMPLEX(2) +#endif + for (; i < rows; ++i) + { + GEMV_PROCESS_ROW_COMPLEX_SINGLE(1) + GEMV_PROCESS_ROW_COMPLEX_PREDUX(0) + for (; j < cols; ++j) + { + dd0 += cj.pmul(lhs(i, j), rhs2(j)); + } + res[i * resIncr] += alpha * dd0; + } +} + +#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_complex_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_complex_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex, float) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex, double) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex, float) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex, double) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex, std::complex) + +#endif // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H + diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 528f995d3..c7587ea77 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -84,7 +84,7 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} -#ifndef 
__VSX__ +#ifndef EIGEN_VECTORIZE_VSX static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} #endif @@ -114,7 +114,7 @@ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 // Define global static constants: #ifdef _BIG_ENDIAN static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #endif static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; @@ -168,7 +168,7 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX HasSqrt = 1, #if !EIGEN_COMP_CLANG HasRsqrt = 1, @@ -210,7 +210,7 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX HasSqrt = 1, #if !EIGEN_COMP_CLANG HasRsqrt = 1, @@ -432,7 +432,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) // ignoring these warnings for now. EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); @@ -481,7 +481,7 @@ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet // ignoring these warnings for now. EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX vec_xst(from, 0, to); #else vec_st(from, 0, to); @@ -816,7 +816,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { -#ifndef __VSX__ // VSX actually provides a div instruction +#ifndef EIGEN_VECTORIZE_VSX // VSX actually provides a div instruction Packet4f t, y_0, y_1; // Altivec does not offer a divide instruction, we have to do a reciprocal approximation @@ -845,7 +845,7 @@ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8 template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { - #ifdef __VSX__ + #ifdef EIGEN_VECTORIZE_VSX // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); @@ -863,7 +863,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { - #ifdef __VSX__ + #ifdef EIGEN_VECTORIZE_VSX // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); @@ -940,7 +940,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) Packet4f t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); Packet4f res; -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX __asm__("xvrspiz %x0, %x1\n\t" : "=&wa" (res) : "wa" (t)); @@ -2259,7 +2259,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Pa //---------- double ---------- -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; @@ -2721,7 
+2721,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons } -#endif // __VSX__ +#endif // EIGEN_VECTORIZE_VSX } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 259ef0ca1..87858a27d 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -363,10 +363,10 @@ #endif } // end extern "C" - #elif defined __VSX__ + #elif defined(__VSX__) && !defined(__APPLE__) #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX + #define EIGEN_VECTORIZE_VSX 1 #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 0ffe155ae..0e89f3709 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -314,7 +314,7 @@ #endif /// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC -#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) +#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__) #define EIGEN_ARCH_PPC 1 #else #define EIGEN_ARCH_PPC 0 @@ -1135,11 +1135,16 @@ namespace Eigen { // directly for std::complex, Eigen::half, Eigen::bfloat16. For these, // you will need to apply to the underlying POD type. #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT - // This seems to be broken on clang. Packet4f is loaded into a single - // register rather than a vector, zeroing out some entries. Integer + // This seems to be broken on clang. Packet4f is loaded into a single + // register rather than a vector, zeroing out some entries. Integer // types also generate a compile error. - // General, Altivec, VSX. - #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #if EIGEN_OS_MAC + // General, Altivec for Apple (VSX were added in ISA v2.06): + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v" (X)); + #else + // General, Altivec, VSX otherwise: + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #endif #elif EIGEN_ARCH_ARM_OR_ARM64 // General, NEON. // Clang doesn't like "r",