diff --git a/CMakeLists.txt b/CMakeLists.txt index 116849c8f..02edc5594 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -486,6 +486,12 @@ if (EIGEN_BUILD_TESTING) message(STATUS "Enabling MSA in tests/examples") endif() + option(EIGEN_TEST_LSX "Enable/Disable LSX in tests/examples" OFF) + if(EIGEN_TEST_LSX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlsx") + message(STATUS "Enabling LSX in tests/examples") + endif() + option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) if(EIGEN_TEST_FMA) diff --git a/Eigen/Core b/Eigen/Core index f87f3d2f6..c63751757 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -234,6 +234,11 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/TypeCasting.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" +#elif defined EIGEN_VECTORIZE_LSX +#include "src/Core/arch/LSX/PacketMath.h" +#include "src/Core/arch/LSX/TypeCasting.h" +#include "src/Core/arch/LSX/MathFunctions.h" +#include "src/Core/arch/LSX/Complex.h" #elif defined EIGEN_VECTORIZE_SVE #include "src/Core/arch/SVE/PacketMath.h" #include "src/Core/arch/SVE/TypeCasting.h" @@ -381,6 +386,8 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/MatrixProduct.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" +#elif defined EIGEN_VECTORIZE_LSX +#include "src/Core/arch/LSX/GeneralBlockPanelKernel.h" #endif #if defined(EIGEN_VECTORIZE_AVX512) diff --git a/Eigen/src/Core/arch/LSX/Complex.h b/Eigen/src/Core/arch/LSX/Complex.h new file mode 100644 index 000000000..0b60a8312 --- /dev/null +++ b/Eigen/src/Core/arch/LSX/Complex.h @@ -0,0 +1,520 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// copyright (c) 2023 zang ruochen +// copyright (c) 2024 XiWei Gu +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_LSX_H +#define EIGEN_COMPLEX_LSX_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet2cf { + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} + Packet4f v; +}; + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasExp = 1, + HasAbs = 0, + HasLog = 1, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet4f as_real; + enum { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { + return Packet2cf(__lsx_vfadd_s(a.v, b.v)); +} +template <> +EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { + return Packet2cf(__lsx_vfsub_s(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { + const uint32_t b[4] = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u}; + Packet4i mask = (Packet4i)__lsx_vld(b, 0); + Packet2cf res; + res.v = (Packet4f)__lsx_vxor_v((__m128i)a.v, mask); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + const uint32_t b[4] = {0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u}; + Packet4i mask = (__m128i)__lsx_vld(b, 0); + Packet2cf res; + res.v = (Packet4f)__lsx_vxor_v((__m128i)a.v, mask); + return res; +} + +template <> +EIGEN_STRONG_INLINE 
Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { + Packet4f part0_tmp = (Packet4f)__lsx_vfmul_s(a.v, b.v); + Packet4f part0 = __lsx_vfsub_s(part0_tmp, (__m128)__lsx_vshuf4i_w(part0_tmp, 0x31)); + Packet4f part1_tmp = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(a.v, 0xb1), b.v); + Packet4f part1 = __lsx_vfadd_s(part1_tmp, (__m128)__lsx_vshuf4i_w(part1_tmp, 0x31)); + Packet2cf res; + res.v = (Packet4f)__lsx_vpackev_w((__m128i)part1, (__m128i)part0); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ptrue(const Packet2cf& a) { + return Packet2cf(ptrue(Packet4f(a.v))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) { + Packet2cf res; + res.v = (Packet4f)__lsx_vand_v((__m128i)a.v, (__m128i)b.v); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) { + Packet2cf res; + res.v = (Packet4f)__lsx_vor_v((__m128i)a.v, (__m128i)b.v); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) { + Packet2cf res; + res.v = (Packet4f)__lsx_vxor_v((__m128i)a.v, (__m128i)b.v); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { + Packet2cf res; + res.v = (Packet4f)__lsx_vandn_v((__m128i)b.v, (__m128i)a.v); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { + float f0 = from.real(), f1 = from.imag(); + Packet4f re = {f0, f0, f0, f0}; + Packet4f im = {f1, f1, f1, f1}; + return Packet2cf((Packet4f)__lsx_vilvl_w((__m128i)im, (__m128i)re)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf 
ploaddup(const std::complex* from) { + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet2cf& from) { + EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet2cf& from) { + EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, + Index stride) { + Packet2cf res; + __m128i tmp = __lsx_vldrepl_d(from, 0); + __m128i tmp1 = __lsx_vldrepl_d(from + stride, 0); + tmp = __lsx_vilvl_d(tmp1, tmp); + res.v = (__m128)tmp; + return res; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, + Index stride) { + __lsx_vstelm_d((__m128i)from.v, to, 0, 0); + __lsx_vstelm_d((__m128i)from.v, to + stride, 0, 1); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { + EIGEN_ALIGN16 std::complex res[2]; + __lsx_vst(a.v, res, 0); + return res[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { + Packet2cf res; + res.v = (Packet4f)__lsx_vshuf4i_w(a.v, 0x4e); + return res; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { + return pfirst(Packet2cf(__lsx_vfadd_s(a.v, vec4f_movehl(a.v, a.v)))); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { + return pfirst(pmul(a, Packet2cf(vec4f_movehl(a.v, a.v)))); +} + +EIGEN_STRONG_INLINE Packet2cf pcplxflip /* */ (const Packet2cf& x) { + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) + +template <> +EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { + return pdiv_complex(a, b); +} + +template <> 
+EIGEN_STRONG_INLINE Packet2cf plog(const Packet2cf& a) { + return plog_complex(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /* a */) { + __m128 v = {0.0f, 0.0f, 0.0f, 0.0f}; + return (Packet2cf)v; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) { + Packet2cf result, t0, t1, t2; + t1 = pzero(t1); + t0.v = (__m128)__lsx_vpackev_w((__m128i)a.v, (__m128i)a.v); + t2.v = __lsx_vfmadd_s(t0.v, b.v, c.v); + result.v = __lsx_vfadd_s(t2.v, t1.v); + t1.v = __lsx_vfsub_s(t1.v, a.v); + t1.v = (__m128)__lsx_vpackod_w((__m128i)a.v, (__m128i)t1.v); + t2.v = (__m128)__lsx_vshuf4i_w((__m128i)b.v, 0xb1); + result.v = __lsx_vfmadd_s(t1.v, t2.v, result.v); + return result; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pexp(const Packet2cf& a) { + return pexp_complex(a); +} + +//---------- double ---------- +struct Packet1cd { + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} + Packet2d v; +}; + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasAbs = 0, + HasLog = 1, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum { + size = 1, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { + return Packet1cd(__lsx_vfadd_d(a.v, b.v)); +} +template <> +EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { + return Packet1cd(__lsx_vfsub_d(a.v, b.v)); +} +template <> +EIGEN_STRONG_INLINE 
Packet1cd pnegate(const Packet1cd& a) { + return Packet1cd(pnegate(Packet2d(a.v))); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { + const uint64_t tmp[2] = {0x0000000000000000u, 0x8000000000000000u}; + __m128i mask = __lsx_vld(tmp, 0); + Packet1cd res; + res.v = (Packet2d)__lsx_vxor_v((__m128i)a.v, mask); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { + Packet2d tmp_real = __lsx_vfmul_d(a.v, b.v); + Packet2d real = __lsx_vfsub_d(tmp_real, preverse(tmp_real)); + + Packet2d tmp_imag = __lsx_vfmul_d(preverse(a.v), b.v); + Packet2d imag = (__m128d)__lsx_vfadd_d((__m128d)tmp_imag, preverse(tmp_imag)); + Packet1cd res; + res.v = (__m128d)__lsx_vilvl_d((__m128i)imag, (__m128i)real); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ptrue(const Packet1cd& a) { + return Packet1cd(ptrue(Packet2d(a.v))); +} +template <> +EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) { + Packet1cd res; + res.v = (Packet2d)__lsx_vand_v((__m128i)a.v, (__m128i)b.v); + return res; +} +template <> +EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) { + Packet1cd res; + res.v = (Packet2d)__lsx_vor_v((__m128i)a.v, (__m128i)b.v); + return res; +} +template <> +EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) { + Packet1cd res; + res.v = (Packet2d)__lsx_vxor_v((__m128i)a.v, (__m128i)b.v); + return res; +} +template <> +EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { + Packet1cd res; + res.v = (Packet2d)__lsx_vandn_v((__m128i)b.v, (__m128i)a.v); + return res; +} + +// FIXME force unaligned load, this is a temporary fix +template <> +EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); +} +template <> +EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return 
Packet1cd(ploadu((const double*)from)); +} +template <> +EIGEN_STRONG_INLINE Packet1cd +pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ + return ploadu(&from); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { + return pset1(*from); +} + +// FIXME force unaligned store, this is a temporary fix +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cd& from) { + EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); +} +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet1cd& from) { + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { + EIGEN_ALIGN16 double res[2]; + __lsx_vst(a.v, res, 0); + return std::complex(res[0], res[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { + return pfirst(a); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { + return pfirst(a); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) + +template <> +EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { + return pdiv_complex(a, b); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip /* */ (const Packet1cd& x) { + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Packet4f tmp1 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v); + Packet4f tmp2 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v); + kernel.packet[0].v = (Packet4f)__lsx_vshuf4i_w(tmp1, 0xd8); + kernel.packet[1].v = (Packet4f)__lsx_vshuf4i_w(tmp2, 0xd8); +} + +template <> 
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = (Packet4f)__lsx_vfcmp_ceq_s(a.v, b.v); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { + Packet2d eq = (Packet2d)__lsx_vfcmp_ceq_d(a.v, b.v); + return Packet1cd(pand(eq, preverse(eq))); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2cf pselect(const Packet2cf& mask, const Packet2cf& a, const Packet2cf& b) { + Packet2cf res; + res.v = (Packet4f)__lsx_vbitsel_v((__m128i)b.v, (__m128i)a.v, (__m128i)mask.v); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd plog(const Packet1cd& a) { + return plog_complex(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pzero(const Packet1cd& /* a */) { + __m128d v = {0.0, 0.0}; + return (Packet1cd)v; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) { + Packet1cd result, t0, t1, t2; + t1 = pzero(t1); + t0.v = (__m128d)__lsx_vpackev_d((__m128i)a.v, (__m128i)a.v); + t2.v = __lsx_vfmadd_d(t0.v, b.v, c.v); + result.v = __lsx_vfadd_d(t2.v, t1.v); + t1.v = __lsx_vfsub_d(t1.v, a.v); + t1.v = (__m128d)__lsx_vpackod_d((__m128i)a.v, (__m128i)t1.v); + t2.v = (__m128d)__lsx_vshuf4i_d((__m128i)t2.v, (__m128i)b.v, 0xb); + result.v = __lsx_vfmadd_d(t1.v, t2.v, result.v); + return result; +} + +template <> +EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, + Index /* stride */) { + Packet1cd res; + __m128i tmp = __lsx_vld((void*)from, 0); + res.v = (__m128d)tmp; + return res; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, + Index /* stride */) { + 
__lsx_vst((__m128i)from.v, (void*)to, 0); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + Packet2d tmp = (__m128d)__lsx_vilvl_d((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v); + kernel.packet[1].v = (__m128d)__lsx_vilvh_d((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v); + kernel.packet[0].v = tmp; +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_LSX_H diff --git a/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..4b0706203 --- /dev/null +++ b/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h @@ -0,0 +1,23 @@ +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +#ifndef EIGEN_LSX_GEBP_NR +#define EIGEN_LSX_GEBP_NR 8 +#endif + +template <> +struct gebp_traits + : gebp_traits { + enum { nr = EIGEN_LSX_GEBP_NR }; +}; + +template <> +struct gebp_traits + : gebp_traits { + enum { nr = EIGEN_LSX_GEBP_NR }; +}; +} // namespace internal +} // namespace Eigen diff --git a/Eigen/src/Core/arch/LSX/MathFunctions.h b/Eigen/src/Core/arch/LSX/MathFunctions.h new file mode 100644 index 000000000..cead46368 --- /dev/null +++ b/Eigen/src/Core/arch/LSX/MathFunctions.h @@ -0,0 +1,43 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 XiWei Gu (guxiwei-hf@loongson.cn) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MATH_FUNCTIONS_LSX_H +#define EIGEN_MATH_FUNCTIONS_LSX_H + +/* The sin and cos functions of this file are loosely derived from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d) +EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d) +EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d) +EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d) + +EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f) +EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f) +EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f) +EIGEN_FLOAT_PACKET_FUNCTION(tanh, Packet4f) + +EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d) +EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f) +EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d) +EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f) + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_LSX_H diff --git a/Eigen/src/Core/arch/LSX/PacketMath.h b/Eigen/src/Core/arch/LSX/PacketMath.h new file mode 100644 index 000000000..87232aa29 --- /dev/null +++ b/Eigen/src/Core/arch/LSX/PacketMath.h @@ -0,0 +1,2866 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2023 Zang Ruochen +// Copyright (C) 2024 XiWei Gu +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_PACKET_MATH_LSX_H +#define EIGEN_PACKET_MATH_LSX_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if EIGEN_ARCH_LOONGARCH64 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +typedef __m128 Packet4f; +typedef __m128d Packet2d; + +typedef eigen_packet_wrapper<__m128i, 0> Packet16c; +typedef eigen_packet_wrapper<__m128i, 1> Packet8s; +typedef eigen_packet_wrapper<__m128i, 2> Packet4i; +typedef eigen_packet_wrapper<__m128i, 3> Packet2l; +typedef eigen_packet_wrapper<__m128i, 4> Packet16uc; +typedef eigen_packet_wrapper<__m128i, 5> Packet8us; +typedef eigen_packet_wrapper<__m128i, 6> Packet4ui; +typedef eigen_packet_wrapper<__m128i, 7> Packet2ul; + +template <> +struct is_arithmetic<__m128> { + enum { value = true }; +}; +template <> +struct is_arithmetic<__m128i> { + enum { value = true }; +}; +template <> +struct is_arithmetic<__m128d> { + enum { value = true }; +}; +template <> +struct is_arithmetic { + enum { value = true }; +}; +template <> +struct is_arithmetic { + enum { value = true }; +}; +template <> +struct is_arithmetic { + enum { value = true }; +}; +template <> +struct is_arithmetic { + enum { value = true }; +}; +template <> +struct is_arithmetic { + enum { value = false }; +}; +template <> +struct is_arithmetic { + enum { value = false }; +}; +template <> +struct is_arithmetic { + enum { value = false }; +}; +template <> +struct is_arithmetic { + enum { value = false }; +}; + +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return (Packet4f)__lsx_vld(from, 0); +} + +EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) { + 
const float* a = reinterpret_cast(&m); + Packet4f res = + make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3))); + return res; +} + +template +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) { + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = + make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) { + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = + make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); + return res; +} + +EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) { + return ((s) << 6 | (r) << 4 | (q) << 2 | (p)); +} + +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) { + return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) { + return shuffle2(a, b, eigen_lsx_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) { + return shuffle2(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) { + return shuffle2(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) { + return shuffle2(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) { + return shuffle2(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3)); +} + +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { + double 
from[2] = {a, b}; + return (Packet2d)__lsx_vld(from, 0); +} + +EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) { + const double* a = reinterpret_cast(&m); + const double* b = reinterpret_cast(&n); + Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1))); + return res; +} + +EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) { + return shuffle(a, b, mask); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); } +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); } + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16c type; + typedef Packet16c half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + + HasAbs2 = 0, + HasSetLinear = 0, + HasCmp = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8s type; + typedef Packet8s half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + + HasAbs2 = 0, + HasSetLinear = 0, + HasCmp = 1, + HasDiv = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4i type; + typedef Packet4i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + + HasAbs2 = 0, + HasSetLinear = 0, + HasCmp = 1, + HasDiv = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2l type; + typedef Packet2l half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + + HasAbs2 = 0, + HasSetLinear = 0, + HasCmp = 1, + HasDiv = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16uc type; + typedef Packet16uc half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + + HasAbs2 = 0, + HasSetLinear = 0, + HasNegate = 0, + HasCmp = 1, + HasBlend = 0 
+ }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8us type; + typedef Packet8us half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + + HasAbs2 = 0, + HasSetLinear = 0, + HasNegate = 0, + HasCmp = 1, + HasDiv = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4ui type; + typedef Packet4ui half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + + HasAbs2 = 0, + HasSetLinear = 0, + HasNegate = 0, + HasCmp = 1, + HasDiv = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2ul type; + typedef Packet2ul half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + + HasAbs2 = 0, + HasSetLinear = 0, + HasNegate = 0, + HasCmp = 1, + HasDiv = 1, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + + HasAbs2 = 0, + HasSetLinear = 0, + HasBlend = 0, + HasSign = 0, + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasLog = 1, + HasRsqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + + HasAbs2 = 0, + HasSetLinear = 0, + HasBlend = 0, + HasSign = 0, + HasDiv = 1, + HasSqrt = 1, + HasLog = 1, + HasRsqrt = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef int8_t type; + typedef Packet16c half; + enum { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef int16_t type; + typedef Packet8s half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct 
unpacket_traits { + typedef int32_t type; + typedef Packet4i half; + enum { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef int64_t type; + typedef Packet2l half; + enum { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef uint8_t type; + typedef Packet16uc half; + enum { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef uint16_t type; + typedef Packet8us half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef uint32_t type; + typedef Packet4ui half; + enum { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef uint64_t type; + typedef Packet2ul half; + enum { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef double type; + typedef Packet2d half; + typedef Packet2l integer_packet; + enum { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { + return 
__lsx_vreplgr2vr_b(from); +} +template <> +EIGEN_STRONG_INLINE Packet8s pset1(const int16_t& from) { + return __lsx_vreplgr2vr_h(from); +} +template <> +EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { + return __lsx_vreplgr2vr_w(from); +} +template <> +EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { + return __lsx_vreplgr2vr_d(from); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { + return __lsx_vreplgr2vr_b(from); +} +template <> +EIGEN_STRONG_INLINE Packet8us pset1(const uint16_t& from) { + return __lsx_vreplgr2vr_h(from); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pset1(const uint32_t& from) { + return __lsx_vreplgr2vr_w(from); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pset1(const uint64_t& from) { + return __lsx_vreplgr2vr_d(from); +} +template <> +EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + Packet4f v = {from, from, from, from}; + return v; +} +template <> +EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + Packet2d v = {from, from}; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pset1frombits(uint32_t from) { + return reinterpret_cast<__m128>((__m128i)pset1(from)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) { + return reinterpret_cast<__m128d>((__m128i)pset1(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet16c plset(const int8_t& a) { + const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + return __lsx_vadd_b(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet8s plset(const int16_t& a) { + const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7}; + return __lsx_vadd_h(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { + const int32_t countdown[] = {0, 1, 2, 3}; + return __lsx_vadd_w(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2l plset(const int64_t& a) { + const int64_t countdown[] = {0, 
1}; + return __lsx_vadd_d(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc plset(const uint8_t& a) { + const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + return __lsx_vadd_b(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet8us plset(const uint16_t& a) { + const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7}; + return __lsx_vadd_h(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui plset(const uint32_t& a) { + const uint32_t countdown[] = {0, 1, 2, 3}; + return __lsx_vadd_w(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul plset(const uint64_t& a) { + const uint64_t countdown[] = {0, 1}; + return __lsx_vadd_d(pset1(a), __lsx_vld(countdown, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet4f plset(const float& a) { + static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f}; + return __lsx_vfadd_s(pset1(a), countdown); +} +template <> +EIGEN_STRONG_INLINE Packet2d plset(const double& a) { + static const Packet2d countdown = {0.0f, 1.0f}; + return __lsx_vfadd_d(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet16c padd(const Packet16c& a, const Packet16c& b) { + return __lsx_vadd_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) { + return __lsx_vadd_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { + return __lsx_vadd_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l padd(const Packet2l& a, const Packet2l& b) { + return __lsx_vadd_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vadd_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us padd(const Packet8us& a, const Packet8us& b) { + return __lsx_vadd_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui padd(const Packet4ui& a, const Packet4ui& b) { + return 
__lsx_vadd_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul padd(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vadd_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { + return __lsx_vfadd_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { + return __lsx_vfadd_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16c psub(const Packet16c& a, const Packet16c& b) { + return __lsx_vsub_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) { + return __lsx_vsub_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { + return __lsx_vsub_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { + return __lsx_vsub_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vsub_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us psub(const Packet8us& a, const Packet8us& b) { + return __lsx_vsub_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui psub(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vsub_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vsub_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { + return __lsx_vfsub_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { + return __lsx_vfsub_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template <> +EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { + const Packet4f mask = + make_packet4f(numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f); + return padd(a, pxor(mask, b)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pxor(const 
Packet2d& a, const Packet2d& b); +template <> +EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b) { + const Packet2d mask = make_packet2d(numext::bit_cast(0x8000000000000000ull), 0.0); + return padd(a, pxor(mask, b)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { + Packet4f mask = make_packet4f(numext::bit_cast(0x80000000), numext::bit_cast(0x80000000), + numext::bit_cast(0x80000000), numext::bit_cast(0x80000000)); + return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { + Packet2d mask = + make_packet2d(numext::bit_cast(0x8000000000000000), numext::bit_cast(0x8000000000000000)); + return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { + return __lsx_vneg_b(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { + return __lsx_vneg_h(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { + return __lsx_vneg_w(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { + return __lsx_vneg_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4ui 
pconj(const Packet4ui& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { + return __lsx_vfmul_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { + return __lsx_vfmul_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16c pmul(const Packet16c& a, const Packet16c& b) { + return __lsx_vmul_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) { + return __lsx_vmul_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { + return __lsx_vmul_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pmul(const Packet2l& a, const Packet2l& b) { + return __lsx_vmul_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vmul_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pmul(const Packet8us& a, const Packet8us& b) { + return __lsx_vmul_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pmul(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vmul_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pmul(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vmul_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { + return __lsx_vfdiv_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { + return __lsx_vfdiv_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pdiv(const Packet8s& a, const Packet8s& b) { + return __lsx_vdiv_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { + return __lsx_vdiv_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pdiv(const Packet2l& a, const Packet2l& b) { + return __lsx_vdiv_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us 
pdiv(const Packet8us& a, const Packet8us& b) { + return __lsx_vdiv_hu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pdiv(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vdiv_wu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vdiv_du(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + return __lsx_vfmadd_s(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + return __lsx_vfmadd_d(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + return __lsx_vfmsub_s(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + return __lsx_vfmsub_d(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + return __lsx_vfnmsub_s(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + return __lsx_vfnmsub_d(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + return __lsx_vfnmadd_s(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + return __lsx_vfnmadd_d(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) { + return __lsx_vmadd_b(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { + return __lsx_vmadd_h(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + return __lsx_vmadd_w(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& 
a, const Packet2l& b, const Packet2l& c) { + return __lsx_vmadd_d(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) { + return __lsx_vmadd_b(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { + return __lsx_vmadd_h(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) { + return __lsx_vmadd_w(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) { + return __lsx_vmadd_d(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b); +} +template <> +EIGEN_STRONG_INLINE Packet16c pand(const Packet16c& a, const Packet16c& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pand(const Packet8s& a, const Packet8s& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pand(const Packet2l& a, const Packet2l& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pand(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vand_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pand(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vand_v(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f 
por(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b); +} +template <> +EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b); +} +template <> +EIGEN_STRONG_INLINE Packet16c por(const Packet16c& a, const Packet16c& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l por(const Packet2l& a, const Packet2l& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc por(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us por(const Packet8us& a, const Packet8us& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui por(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul por(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vor_v(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b); +} +template <> +EIGEN_STRONG_INLINE Packet16c pxor(const Packet16c& a, const Packet16c& b) { + return __lsx_vxor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pxor(const Packet8s& a, const Packet8s& b) { + return __lsx_vxor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { + return __lsx_vxor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pxor(const Packet2l& a, const Packet2l& b) { + return __lsx_vxor_v(a, b); +} 
+template <> +EIGEN_STRONG_INLINE Packet16uc pxor(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vxor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pxor(const Packet8us& a, const Packet8us& b) { + return __lsx_vxor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pxor(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vxor_v(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pxor(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vxor_v(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet16c pandnot(const Packet16c& a, const Packet16c& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet8s pandnot(const Packet8s& a, const Packet8s& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, const Packet2l& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pandnot(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet8us pandnot(const Packet8us& a, const Packet8us& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pandnot(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vandn_v(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pandnot(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vandn_v(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vfcmp_cle_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d 
pcmp_le(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vfcmp_cle_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { + return __lsx_vsle_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { + return __lsx_vsle_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { + return __lsx_vsle_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) { + return __lsx_vsle_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vsle_bu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { + return __lsx_vsle_hu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vsle_wu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcmp_le(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vsle_du(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vfcmp_clt_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vfcmp_clt_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { + return __lsx_vslt_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { + return __lsx_vslt_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { + return __lsx_vslt_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) { + return __lsx_vslt_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vslt_bu(a, 
b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { + return __lsx_vslt_hu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vslt_wu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcmp_lt(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vslt_du(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vfcmp_sult_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vfcmp_sult_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vfcmp_seq_s(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vfcmp_seq_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { + return __lsx_vseq_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { + return __lsx_vseq_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { + return __lsx_vseq_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) { + return __lsx_vseq_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vseq_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { + return __lsx_vseq_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vseq_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcmp_eq(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vseq_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE 
Packet16c pmin(const Packet16c& a, const Packet16c& b) { + return __lsx_vmin_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { + return __lsx_vmin_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { + return __lsx_vmin_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pmin(const Packet2l& a, const Packet2l& b) { + return __lsx_vmin_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vmin_bu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { + return __lsx_vmin_hu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pmin(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vmin_wu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pmin(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vmin_du(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { + return __lsx_vmax_b(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { + return __lsx_vmax_h(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { + return __lsx_vmax_w(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pmax(const Packet2l& a, const Packet2l& b) { + return __lsx_vmax_d(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vmax_bu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { + return __lsx_vmax_hu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pmax(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vmax_wu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pmax(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vmax_du(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& 
a, const Packet4f& b) { + // __lsx_vfcmp_cun_s(a, a) is an unordered self-compare: lanes where 'a' is NaN. + Packet4i aNaN = __lsx_vfcmp_cun_s(a, a); + // Select 'a' where a < b OR 'a' is NaN — i.e. a NaN in 'a' propagates into the result. + Packet4i aMinOrNaN = por(__lsx_vfcmp_clt_s(a, b), aNaN); + // vbitsel: mask bits set -> take 'a', clear -> take 'b'. + return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN); +} +template <> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + // Same NaN-propagating min as the float variant, on 64-bit lanes. + Packet2l aNaN = __lsx_vfcmp_cun_d(a, a); + Packet2l aMinOrNaN = por(__lsx_vfcmp_clt_d(a, b), aNaN); + return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN); +} +template <> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + // Mirror of pmin: select 'a' where b < a OR 'a' is NaN (NaN in 'a' propagates). + Packet4i aNaN = __lsx_vfcmp_cun_s(a, a); + Packet4i aMaxOrNaN = por(__lsx_vfcmp_clt_s(b, a), aNaN); + return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN); +} +template <> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + Packet2l aNaN = __lsx_vfcmp_cun_d(a, a); + Packet2l aMaxOrNaN = por(__lsx_vfcmp_clt_d(b, a), aNaN); + return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN); +} + +// Compile-time shift by N bits per lane. +// NOTE(review): the 'template' headers below appear to have lost their '<int N>' +// parameter lists in this rendering of the patch — confirm against the original. +template +EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) { + return __lsx_vsrai_b((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) { + return __lsx_vsrai_h((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { + return __lsx_vsrai_w((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) { + return __lsx_vsrai_d((__m128i)a, N); +} +// Unsigned lanes: arithmetic shift right is identical to logical shift right, +// hence vsrli (logical) rather than vsrai below. +template +EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) { + return __lsx_vsrli_b((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) { + return __lsx_vsrli_h((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) { + return __lsx_vsrli_w((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) { + return 
__lsx_vsrli_d((__m128i)a, N); +} + +template +EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) { + return __lsx_vsrli_b((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) { + return __lsx_vsrli_h((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) { + return __lsx_vsrli_w((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + return __lsx_vsrli_d((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) { + return __lsx_vsrli_b((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) { + return __lsx_vsrli_h((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) { + return __lsx_vsrli_w((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) { + return __lsx_vsrli_d((__m128i)a, N); +} + +template +EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) { + return __lsx_vslli_b((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) { + return __lsx_vslli_h((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) { + return __lsx_vslli_w((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + return __lsx_vslli_d((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) { + return __lsx_vslli_b((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) { + return __lsx_vslli_h((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) { + return __lsx_vslli_w((__m128i)a, N); +} +template +EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) { + 
return __lsx_vslli_d((__m128i)a, N); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { + return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31); +} +template <> +EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { + return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63); +} +template <> +EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { + return __lsx_vabsd_b(a, pzero(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { + return __lsx_vabsd_h(a, pzero(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { + return __lsx_vabsd_w(a, pzero(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { + return __lsx_vabsd_d(a, pzero(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pload(const int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} 
+template <> +EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s ploadu(const int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2l ploadu(const int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc ploadu(const uint8_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us ploadu(const uint16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui ploadu(const uint32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2ul ploadu(const uint64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { + float f0 = from[0], f1 = from[1]; + return make_packet4f(f0, f0, f1, f1); +} +template <> +EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { + return pset1(from[0]); +} 
+template <> +EIGEN_STRONG_INLINE Packet16c ploaddup(const int8_t* from) { + Packet16c tmp = pload(from); + return __lsx_vilvl_b(tmp, tmp); +} +template <> +EIGEN_STRONG_INLINE Packet8s ploaddup(const int16_t* from) { + Packet8s tmp = pload(from); + return __lsx_vilvl_h(tmp, tmp); +} +template <> +EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) { + Packet4i tmp = pload(from); + return __lsx_vilvl_w(tmp, tmp); +} +template <> +EIGEN_STRONG_INLINE Packet2l ploaddup(const int64_t* from) { + return pset1(from[0]); +} +template <> +EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* from) { + Packet16uc tmp = pload(from); + return __lsx_vilvl_b(tmp, tmp); +} +template <> +EIGEN_STRONG_INLINE Packet8us ploaddup(const uint16_t* from) { + Packet8us tmp = pload(from); + return __lsx_vilvl_h(tmp, tmp); +} +template <> +EIGEN_STRONG_INLINE Packet4ui ploaddup(const uint32_t* from) { + Packet4ui tmp = pload(from); + return __lsx_vilvl_w(tmp, tmp); +} +template <> +EIGEN_STRONG_INLINE Packet2ul ploaddup(const uint64_t* from) { + return pset1(from[0]); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(uint8_t* to, const 
Packet16uc& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) { + EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet8s& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet2l& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet8us& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet4ui& from) { + EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(uint64_t* to, const Packet2ul& from) { + EIGEN_DEBUG_UNALIGNED_STORE 
__lsx_vst((__m128i)from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather(const float* from, Index stride) { + Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]}; + return v; +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather(const double* from, Index stride) { + Packet2d v = {from[0], from[stride]}; + return v; +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather(const int8_t* from, Index stride) { + int8_t v[16] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + v[4] = from[4 * stride]; + v[5] = from[5 * stride]; + v[6] = from[6 * stride]; + v[7] = from[7 * stride]; + v[8] = from[8 * stride]; + v[9] = from[9 * stride]; + v[10] = from[10 * stride]; + v[11] = from[11 * stride]; + v[12] = from[12 * stride]; + v[13] = from[13 * stride]; + v[14] = from[14 * stride]; + v[15] = from[15 * stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather(const int16_t* from, Index stride) { + int16_t v[8] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + v[4] = from[4 * stride]; + v[5] = from[5 * stride]; + v[6] = from[6 * stride]; + v[7] = from[7 * stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather(const int32_t* from, Index stride) { + int32_t v[4] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather(const int64_t* from, Index stride) { + int64_t v[2] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather(const uint8_t* from, Index stride) { + 
uint8_t v[16] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + v[4] = from[4 * stride]; + v[5] = from[5 * stride]; + v[6] = from[6 * stride]; + v[7] = from[7 * stride]; + v[8] = from[8 * stride]; + v[9] = from[9 * stride]; + v[10] = from[10 * stride]; + v[11] = from[11 * stride]; + v[12] = from[12 * stride]; + v[13] = from[13 * stride]; + v[14] = from[14 * stride]; + v[15] = from[15 * stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather(const uint16_t* from, Index stride) { + uint16_t v[8] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + v[4] = from[4 * stride]; + v[5] = from[5 * stride]; + v[6] = from[6 * stride]; + v[7] = from[7 * stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather(const uint32_t* from, Index stride) { + uint32_t v[4] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return __lsx_vld(v, 0); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather(const uint64_t* from, Index stride) { + uint64_t v[2] __attribute__((aligned(16))); + v[0] = from[0]; + v[1] = from[stride]; + return __lsx_vld(v, 0); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet4f& from, Index stride) { + __lsx_vstelm_w(from, to, 0, 0); + __lsx_vstelm_w(from, to + stride * 1, 0, 1); + __lsx_vstelm_w(from, to + stride * 2, 0, 2); + __lsx_vstelm_w(from, to + stride * 3, 0, 3); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(double* to, const Packet2d& from, Index stride) { + __lsx_vstelm_d(from, to, 0, 0); + __lsx_vstelm_d(from, to + stride, 0, 1); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet16c& from, + Index stride) 
{ + __lsx_vstelm_b((__m128i)from, to, 0, 0); + __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1); + __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2); + __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3); + __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4); + __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5); + __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6); + __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7); + __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8); + __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9); + __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10); + __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11); + __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12); + __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13); + __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14); + __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet8s& from, + Index stride) { + __lsx_vstelm_h((__m128i)from, to, 0, 0); + __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1); + __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2); + __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3); + __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4); + __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5); + __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6); + __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet4i& from, + Index stride) { + __lsx_vstelm_w((__m128i)from, to, 0, 0); + __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1); + __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2); + __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int64_t* to, const Packet2l& from, + Index stride) { + __lsx_vstelm_d((__m128i)from, to, 0, 0); + __lsx_vstelm_d((__m128i)from, to + stride * 
1, 0, 1); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet16uc& from, + Index stride) { + __lsx_vstelm_b((__m128i)from, to, 0, 0); + __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1); + __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2); + __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3); + __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4); + __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5); + __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6); + __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7); + __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8); + __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9); + __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10); + __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11); + __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12); + __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13); + __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14); + __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const Packet8us& from, + Index stride) { + __lsx_vstelm_h((__m128i)from, to, 0, 0); + __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1); + __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2); + __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3); + __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4); + __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5); + __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6); + __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet4ui& from, + Index stride) { + __lsx_vstelm_w((__m128i)from, to, 0, 0); + __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1); + __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2); + __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
pscatter(uint64_t* to, const Packet2ul& from, + Index stride) { + __lsx_vstelm_d((__m128i)from, to, 0, 0); + __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1); +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const float* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const double* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const int8_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const int16_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const int64_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const uint8_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const uint16_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const uint32_t* addr) { + __builtin_prefetch(addr); +} +template <> +EIGEN_STRONG_INLINE void prefetch(const uint64_t* addr) { + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + float v; + __lsx_vstelm_w(a, &v, 0, 0); + return v; +} +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + double v; + __lsx_vstelm_d(a, &v, 0, 0); + return v; +} + +template <> +EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) { + return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE int16_t pfirst(const Packet8s& a) { + return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { + return __lsx_vpickve2gr_w((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { + return __lsx_vpickve2gr_d((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) { 
+ return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE uint16_t pfirst(const Packet8us& a) { + return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { + return __lsx_vpickve2gr_wu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE uint64_t pfirst(const Packet2ul& a) { + return __lsx_vpickve2gr_du((__m128i)a, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + return (Packet4f)__lsx_vshuf4i_w(a, 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { + return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1); +} +template <> +EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) { + return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) { + return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + return __lsx_vshuf4i_w((__m128i)a, 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) { + return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) { + return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) { + return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { + return __lsx_vshuf4i_w((__m128i)a, 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) { + return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4f& a) { + Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3)); + return pfirst(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1))); +} +template 
<> +EIGEN_STRONG_INLINE double predux(const Packet2d& a) { + return pfirst(__lsx_vfadd_d(a, preverse(a))); +} +template <> +EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) { + Packet8s tmp1 = __lsx_vhaddw_h_b(a, a); + Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1); + Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2); + return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0); +} +template <> +EIGEN_STRONG_INLINE int16_t predux(const Packet8s& a) { + Packet4i tmp1 = __lsx_vhaddw_w_h(a, a); + Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1); + return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0); +} +template <> +EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { + Packet2l tmp = __lsx_vhaddw_d_w(a, a); + return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0); +} +template <> +EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) { + return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0); +} +template <> +EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a); + Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1); + Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2); + return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0); +} +template <> +EIGEN_STRONG_INLINE uint16_t predux(const Packet8us& a) { + Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a); + Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1); + return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0); +} +template <> +EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { + Packet2ul tmp = __lsx_vhaddw_du_wu(a, a); + return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0); +} +template <> +EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) { + return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { + Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3)); + return pfirst(__lsx_vfmul_s(tmp, 
vec4f_swizzle1(tmp, 1, 1, 1, 1))); +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { + return pfirst(__lsx_vfmul_d(a, preverse(a))); +} +template <> +EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) { + Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a)); + Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1)); + Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2)); + return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0); +} +template <> +EIGEN_STRONG_INLINE int16_t predux_mul(const Packet8s& a) { + Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a)); + Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1)); + return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0); +} +template <> +EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { + Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a)); + return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0); +} +template <> +EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) { + return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0); +} +template <> +EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a)); + Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1)); + Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2)); + return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0); +} +template <> +EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a) { + Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a)); + Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1)); + return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0); +} +template <> +EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) { + Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a)); + return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0); +} +template <> +EIGEN_STRONG_INLINE uint64_t 
predux_mul(const Packet2ul& a) { + return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E)); + return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + return pfirst(__lsx_vfmin_d(a, preverse(a))); +} +template <> +EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) { + Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); + return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE int16_t predux_min(const Packet8s& a) { + Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { + Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE int64_t predux_min(const Packet2l& a) { + return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a))); +} +template <> +EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) { + Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); + return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE uint16_t predux_min(const Packet8us& a) { + 
Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) { + Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE uint64_t predux_min(const Packet2ul& a) { + return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a))); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E)); + return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + return pfirst(__lsx_vfmax_d(a, preverse(a))); +} +template <> +EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) { + Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); + return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE int16_t predux_max(const Packet8s& a) { + Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { + Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE int64_t predux_max(const Packet2l& a) { + return pfirst((Packet2l)__lsx_vmax_d(a, 
preverse(a))); +} +template <> +EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) { + Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); + return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE uint16_t predux_max(const Packet8us& a) { + Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); + return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) { + Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); + return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); +} +template <> +EIGEN_STRONG_INLINE uint64_t predux_max(const Packet2ul& a) { + return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + return __lsx_vfsqrt_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { + return __lsx_vfsqrt_d(a); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); + Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); + Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]); + Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]); + + kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0); + kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0); + kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1); + 
kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); + kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); + kernel.packet[1] = tmp; +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); + __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); + __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); + __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); + __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); + __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]); + __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]); + __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]); + __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]); + __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]); + __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]); + __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]); + __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]); + + __m128i s0 = __lsx_vilvl_h(t2, t0); + __m128i s1 = __lsx_vilvh_h(t2, t0); + __m128i s2 = __lsx_vilvl_h(t3, t1); + __m128i s3 = __lsx_vilvh_h(t3, t1); + __m128i s4 = __lsx_vilvl_h(t6, t4); + __m128i s5 = __lsx_vilvh_h(t6, t4); + __m128i s6 = __lsx_vilvl_h(t7, t5); + __m128i s7 = __lsx_vilvh_h(t7, t5); + __m128i s8 = __lsx_vilvl_h(ta, t8); + __m128i s9 = __lsx_vilvh_h(ta, t8); + __m128i sa = __lsx_vilvl_h(tb, t9); + __m128i sb = __lsx_vilvh_h(tb, t9); + __m128i sc = 
__lsx_vilvl_h(te, tc); + __m128i sd = __lsx_vilvh_h(te, tc); + __m128i se = __lsx_vilvl_h(tf, td); + __m128i sf = __lsx_vilvh_h(tf, td); + + __m128i u0 = __lsx_vilvl_w(s4, s0); + __m128i u1 = __lsx_vilvh_w(s4, s0); + __m128i u2 = __lsx_vilvl_w(s5, s1); + __m128i u3 = __lsx_vilvh_w(s5, s1); + __m128i u4 = __lsx_vilvl_w(s6, s2); + __m128i u5 = __lsx_vilvh_w(s6, s2); + __m128i u6 = __lsx_vilvl_w(s7, s3); + __m128i u7 = __lsx_vilvh_w(s7, s3); + __m128i u8 = __lsx_vilvl_w(sc, s8); + __m128i u9 = __lsx_vilvh_w(sc, s8); + __m128i ua = __lsx_vilvl_w(sd, s9); + __m128i ub = __lsx_vilvh_w(sd, s9); + __m128i uc = __lsx_vilvl_w(se, sa); + __m128i ud = __lsx_vilvh_w(se, sa); + __m128i ue = __lsx_vilvl_w(sf, sb); + __m128i uf = __lsx_vilvh_w(sf, sb); + + kernel.packet[0] = __lsx_vilvl_d(u8, u0); + kernel.packet[1] = __lsx_vilvh_d(u8, u0); + kernel.packet[2] = __lsx_vilvl_d(u9, u1); + kernel.packet[3] = __lsx_vilvh_d(u9, u1); + kernel.packet[4] = __lsx_vilvl_d(ua, u2); + kernel.packet[5] = __lsx_vilvh_d(ua, u2); + kernel.packet[6] = __lsx_vilvl_d(ub, u3); + kernel.packet[7] = __lsx_vilvh_d(ub, u3); + kernel.packet[8] = __lsx_vilvl_d(uc, u4); + kernel.packet[9] = __lsx_vilvh_d(uc, u4); + kernel.packet[10] = __lsx_vilvl_d(ud, u5); + kernel.packet[11] = __lsx_vilvh_d(ud, u5); + kernel.packet[12] = __lsx_vilvl_d(ue, u6); + kernel.packet[13] = __lsx_vilvh_d(ue, u6); + kernel.packet[14] = __lsx_vilvl_d(uf, u7); + kernel.packet[15] = __lsx_vilvh_d(uf, u7); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); + __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); + __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); + __m128i t6 = __lsx_vilvl_b(kernel.packet[7], 
kernel.packet[6]); + __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); + + __m128i s0 = __lsx_vilvl_h(t2, t0); + __m128i s1 = __lsx_vilvh_h(t2, t0); + __m128i s2 = __lsx_vilvl_h(t3, t1); + __m128i s3 = __lsx_vilvh_h(t3, t1); + __m128i s4 = __lsx_vilvl_h(t6, t4); + __m128i s5 = __lsx_vilvh_h(t6, t4); + __m128i s6 = __lsx_vilvl_h(t7, t5); + __m128i s7 = __lsx_vilvh_h(t7, t5); + + kernel.packet[0] = __lsx_vilvl_w(s4, s0); + kernel.packet[1] = __lsx_vilvh_w(s4, s0); + kernel.packet[2] = __lsx_vilvl_w(s5, s1); + kernel.packet[3] = __lsx_vilvh_w(s5, s1); + kernel.packet[4] = __lsx_vilvl_w(s6, s2); + kernel.packet[5] = __lsx_vilvh_w(s6, s2); + kernel.packet[6] = __lsx_vilvl_w(s7, s3); + kernel.packet[7] = __lsx_vilvh_w(s7, s3); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = __lsx_vilvl_h(t2, t0); + kernel.packet[1] = __lsx_vilvh_h(t2, t0); + kernel.packet[2] = __lsx_vilvl_h(t3, t1); + kernel.packet[3] = __lsx_vilvh_h(t3, t1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); + __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]); + __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]); + __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]); + __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]); + + __m128i s0 = __lsx_vilvl_w(t2, t0); + __m128i s1 = __lsx_vilvh_w(t2, t0); + __m128i s2 = __lsx_vilvl_w(t3, t1); + 
__m128i s3 = __lsx_vilvh_w(t3, t1); + __m128i s4 = __lsx_vilvl_w(t6, t4); + __m128i s5 = __lsx_vilvh_w(t6, t4); + __m128i s6 = __lsx_vilvl_w(t7, t5); + __m128i s7 = __lsx_vilvh_w(t7, t5); + + kernel.packet[0] = __lsx_vilvl_d(s4, s0); + kernel.packet[1] = __lsx_vilvh_d(s4, s0); + kernel.packet[2] = __lsx_vilvl_d(s5, s1); + kernel.packet[3] = __lsx_vilvh_d(s5, s1); + kernel.packet[4] = __lsx_vilvl_d(s6, s2); + kernel.packet[5] = __lsx_vilvh_d(s6, s2); + kernel.packet[6] = __lsx_vilvl_d(s7, s3); + kernel.packet[7] = __lsx_vilvh_d(s7, s3); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = __lsx_vilvl_w(t2, t0); + kernel.packet[1] = __lsx_vilvh_w(t2, t0); + kernel.packet[2] = __lsx_vilvl_w(t3, t1); + kernel.packet[3] = __lsx_vilvh_w(t3, t1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]); + __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]); + __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]); + __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = __lsx_vilvl_d(T2, T0); + kernel.packet[1] = __lsx_vilvh_d(T2, T0); + kernel.packet[2] = __lsx_vilvl_d(T3, T1); + kernel.packet[3] = __lsx_vilvh_d(T3, T1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]); + kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]); + kernel.packet[1] = tmp; +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); + __m128i t1 = 
__lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); + __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); + __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); + __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); + __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); + __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]); + __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]); + __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]); + __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]); + __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]); + __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]); + __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]); + __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]); + + __m128i s0 = __lsx_vilvl_h(t2, t0); + __m128i s1 = __lsx_vilvh_h(t2, t0); + __m128i s2 = __lsx_vilvl_h(t3, t1); + __m128i s3 = __lsx_vilvh_h(t3, t1); + __m128i s4 = __lsx_vilvl_h(t6, t4); + __m128i s5 = __lsx_vilvh_h(t6, t4); + __m128i s6 = __lsx_vilvl_h(t7, t5); + __m128i s7 = __lsx_vilvh_h(t7, t5); + __m128i s8 = __lsx_vilvl_h(ta, t8); + __m128i s9 = __lsx_vilvh_h(ta, t8); + __m128i sa = __lsx_vilvl_h(tb, t9); + __m128i sb = __lsx_vilvh_h(tb, t9); + __m128i sc = __lsx_vilvl_h(te, tc); + __m128i sd = __lsx_vilvh_h(te, tc); + __m128i se = __lsx_vilvl_h(tf, td); + __m128i sf = __lsx_vilvh_h(tf, td); + + __m128i u0 = __lsx_vilvl_w(s4, s0); + __m128i u1 = __lsx_vilvh_w(s4, s0); + __m128i u2 = __lsx_vilvl_w(s5, s1); + __m128i u3 = __lsx_vilvh_w(s5, s1); + __m128i u4 = __lsx_vilvl_w(s6, s2); + __m128i u5 = __lsx_vilvh_w(s6, s2); + __m128i u6 = __lsx_vilvl_w(s7, s3); + __m128i u7 = __lsx_vilvh_w(s7, s3); + __m128i u8 = __lsx_vilvl_w(sc, s8); + __m128i u9 = __lsx_vilvh_w(sc, s8); + __m128i ua = 
__lsx_vilvl_w(sd, s9); + __m128i ub = __lsx_vilvh_w(sd, s9); + __m128i uc = __lsx_vilvl_w(se, sa); + __m128i ud = __lsx_vilvh_w(se, sa); + __m128i ue = __lsx_vilvl_w(sf, sb); + __m128i uf = __lsx_vilvh_w(sf, sb); + + kernel.packet[0] = __lsx_vilvl_d(u8, u0); + kernel.packet[1] = __lsx_vilvh_d(u8, u0); + kernel.packet[2] = __lsx_vilvl_d(u9, u1); + kernel.packet[3] = __lsx_vilvh_d(u9, u1); + kernel.packet[4] = __lsx_vilvl_d(ua, u2); + kernel.packet[5] = __lsx_vilvh_d(ua, u2); + kernel.packet[6] = __lsx_vilvl_d(ub, u3); + kernel.packet[7] = __lsx_vilvh_d(ub, u3); + kernel.packet[8] = __lsx_vilvl_d(uc, u4); + kernel.packet[9] = __lsx_vilvh_d(uc, u4); + kernel.packet[10] = __lsx_vilvl_d(ud, u5); + kernel.packet[11] = __lsx_vilvh_d(ud, u5); + kernel.packet[12] = __lsx_vilvl_d(ue, u6); + kernel.packet[13] = __lsx_vilvh_d(ue, u6); + kernel.packet[14] = __lsx_vilvl_d(uf, u7); + kernel.packet[15] = __lsx_vilvh_d(uf, u7); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); + __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); + __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); + __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); + __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); + + __m128i s0 = __lsx_vilvl_h(t2, t0); + __m128i s1 = __lsx_vilvh_h(t2, t0); + __m128i s2 = __lsx_vilvl_h(t3, t1); + __m128i s3 = __lsx_vilvh_h(t3, t1); + __m128i s4 = __lsx_vilvl_h(t6, t4); + __m128i s5 = __lsx_vilvh_h(t6, t4); + __m128i s6 = __lsx_vilvl_h(t7, t5); + __m128i s7 = __lsx_vilvh_h(t7, t5); + + kernel.packet[0] = __lsx_vilvl_w(s4, s0); + kernel.packet[1] = __lsx_vilvh_w(s4, s0); + kernel.packet[2] = __lsx_vilvl_w(s5, s1); + kernel.packet[3] = 
__lsx_vilvh_w(s5, s1); + kernel.packet[4] = __lsx_vilvl_w(s6, s2); + kernel.packet[5] = __lsx_vilvh_w(s6, s2); + kernel.packet[6] = __lsx_vilvl_w(s7, s3); + kernel.packet[7] = __lsx_vilvh_w(s7, s3); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = __lsx_vilvl_h(t2, t0); + kernel.packet[1] = __lsx_vilvh_h(t2, t0); + kernel.packet[2] = __lsx_vilvl_h(t3, t1); + kernel.packet[3] = __lsx_vilvh_h(t3, t1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); + __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]); + __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]); + __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]); + __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]); + + __m128i s0 = __lsx_vilvl_w(t2, t0); + __m128i s1 = __lsx_vilvh_w(t2, t0); + __m128i s2 = __lsx_vilvl_w(t3, t1); + __m128i s3 = __lsx_vilvh_w(t3, t1); + __m128i s4 = __lsx_vilvl_w(t6, t4); + __m128i s5 = __lsx_vilvh_w(t6, t4); + __m128i s6 = __lsx_vilvl_w(t7, t5); + __m128i s7 = __lsx_vilvh_w(t7, t5); + + kernel.packet[0] = __lsx_vilvl_d(s4, s0); + kernel.packet[1] = __lsx_vilvh_d(s4, s0); + kernel.packet[2] = __lsx_vilvl_d(s5, s1); + kernel.packet[3] = __lsx_vilvh_d(s5, s1); + kernel.packet[4] = __lsx_vilvl_d(s6, s2); + kernel.packet[5] = __lsx_vilvh_d(s6, s2); + kernel.packet[6] = __lsx_vilvl_d(s7, s3); + kernel.packet[7] = __lsx_vilvh_d(s7, s3); +} 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); + __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); + __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); + __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = __lsx_vilvl_w(t2, t0); + kernel.packet[1] = __lsx_vilvh_w(t2, t0); + kernel.packet[2] = __lsx_vilvl_w(t3, t1); + kernel.packet[3] = __lsx_vilvh_w(t3, t1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]); + __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]); + __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]); + __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = __lsx_vilvl_d(T2, T0); + kernel.packet[1] = __lsx_vilvh_d(T2, T0); + kernel.packet[2] = __lsx_vilvl_d(T3, T1); + kernel.packet[3] = __lsx_vilvh_d(T3, T1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]); + kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]); + kernel.packet[1] = tmp; +} + +template <> +EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + return __lsx_vfrsqrt_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + return __lsx_vfrsqrt_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { + return __lsx_vfrintrm_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { + return __lsx_vfrintrm_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { + return __lsx_vfrintrp_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { + return __lsx_vfrintrp_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { + const 
Packet4f mask = pset1frombits(static_cast(0x80000000u)); + const Packet4f prev0dot5 = pset1frombits(static_cast(0x3EFFFFFFu)); + return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { + const Packet2d mask = pset1frombits(static_cast(0x8000000000000000ull)); + const Packet2d prev0dot5 = pset1frombits(static_cast(0x3FDFFFFFFFFFFFFFull)); + return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) { + return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask); +} + +template <> +EIGEN_STRONG_INLINE Packet16c ploadquad(const int8_t* from) { + int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1), + *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2), + *(from + 3), *(from + 3), *(from + 3), *(from + 3)}; + return __lsx_vld(tmp, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc ploadquad(const uint8_t* from) { + uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1), + *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2), + *(from + 3), *(from + 3), *(from + 3), *(from + 3)}; + return __lsx_vld(tmp, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s ploadquad(const int16_t* from) { + int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)}; + return __lsx_vld(tmp, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us ploadquad(const uint16_t* from) { + uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)}; + return __lsx_vld(tmp, 0); +} +template <> 
+EIGEN_STRONG_INLINE Packet4i ploadquad(const int32_t* from) { + int32_t tmp[4] = {*from, *from, *from, *from}; + return __lsx_vld(tmp, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { + uint32_t tmp[4] = {*from, *from, *from, *from}; + return __lsx_vld(tmp, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) { + return __lsx_vmsub_b(pnegate(c), a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) { + return __lsx_vmsub_h(pnegate(c), a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + return __lsx_vmsub_w(pnegate(c), a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) { + return __lsx_vmsub_d(pnegate(c), a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) { + return __lsx_vmadd_b(pnegate(c), a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) { + return __lsx_vmadd_h(pnegate(c), a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + return __lsx_vmadd_w(pnegate(c), a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) { + return __lsx_vmadd_d(pnegate(c), a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) { + return __lsx_vmsub_b(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { + return __lsx_vmsub_h(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + return __lsx_vmsub_w(c, a, b); +} +template <> 
+EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) { + return __lsx_vmsub_d(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) { + return pexp_float(_x); +} +template <> +EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) { + return pexp_double(_x); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) { + return pfrexp_generic(a, exponent); +} +template <> +EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_generic(a, exponent); +} +template <> +EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) { + Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f}; + return v; +} +template <> +EIGEN_STRONG_INLINE Packet4f pabsdiff(const Packet4f& a, const Packet4f& b) { + Packet4f v = psub(a, b); + return pabs(v); +} +template <> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pmin(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pmax(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { + return (__m128)__lsx_vldrepl_w(from, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { + return (__m128)__lsx_vsrai_w((__m128i)a, 31); +} +template <> +EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + return __lsx_vfrintrne_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f ptrunc(const Packet4f& a) { + return __lsx_vfrintrz_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a) { + return __lsx_vfrecip_s(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) { + Packet2d v = {0.0, 0.0}; + return v; +} +template <> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) 
{ + return pmin(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pmax(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { + return (__m128d)(__lsx_vsrai_d((__m128i)a, 63)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { + return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask); +} +template <> +EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + return __lsx_vfrintrne_d(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d ptrunc(const Packet2d& a) { + return __lsx_vfrintrz_d(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + return pldexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE Packet16c pabsdiff(const Packet16c& a, const Packet16c& b) { + Packet16c v = psub(a, b); + return pabs(v); +} + +template <> +EIGEN_STRONG_INLINE Packet8s pabsdiff(const Packet8s& a, const Packet8s& b) { + Packet8s v = psub(a, b); + return pabs(v); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) { + return __lsx_vbitsel_v(b, a, mask); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pabsdiff(const Packet4i& a, const Packet4i& b) { + Packet4i v = psub(a, b); + return pabs(v); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) { + return __lsx_vbitsel_v(b, a, mask); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) { + return __lsx_vbitsel_v(b, a, mask); +} + +template <> +EIGEN_STRONG_INLINE Packet16uc pdiv(const Packet16uc& a, const Packet16uc& b) { + return __lsx_vdiv_bu(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pabsdiff(const Packet16uc& a, const Packet16uc& b) { + 
Packet16uc v = psub(a, b); + return pabs(v); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, + const Packet16uc& b) { + return __lsx_vbitsel_v(b, a, mask); +} +template <> +EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) { + __m128i res = {0, 0}; + __m128i add = {0x0808080808080808, 0x0808080808080808}; + for (int i = 0; i < 4; i++) { + const __m128i temp = __lsx_vor_v(res, add); + const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp)); + res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a)); + add = __lsx_vsrli_b(add, 1); + } + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet8us pabsdiff(const Packet8us& a, const Packet8us& b) { + Packet8us v = psub(a, b); + return pabs(v); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) { + return __lsx_vbitsel_v(b, a, mask); +} +template <> +EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) { + __m128i res = {0, 0}; + __m128i add = {0x0080008000800080, 0x0080008000800080}; + for (int i = 0; i < 4; i++) { + const __m128i temp = __lsx_vor_v(res, add); + const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp)); + res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a)); + add = __lsx_vsrli_h(add, 1); + } + return res; +} + +template <> +EIGEN_STRONG_INLINE Packet4ui pabsdiff(const Packet4ui& a, const Packet4ui& b) { + Packet4ui v = psub(a, b); + return pabs(v); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) { + return __lsx_vbitsel_v(b, a, mask); +} +template <> +EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { + __m128i res = {0, 0}; + __m128i add = {0x0000800000008000, 0x0000800000008000}; + for (int i = 0; i < 4; i++) { + const __m128i temp = __lsx_vor_v(res, 
add); + const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp)); + res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a)); + add = __lsx_vsrli_w(add, 1); + } + return res; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) { + return __lsx_vbitsel_v(b, a, mask); +} + +} // namespace internal +} // namespace Eigen +#endif diff --git a/Eigen/src/Core/arch/LSX/TypeCasting.h b/Eigen/src/Core/arch/LSX/TypeCasting.h new file mode 100644 index 000000000..cda868067 --- /dev/null +++ b/Eigen/src/Core/arch/LSX/TypeCasting.h @@ -0,0 +1,526 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2023 Zang Ruochen +// Copyright (C) 2024 XiWei Gu +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_LSX_H +#define EIGEN_TYPE_CASTING_LSX_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return (__m128)((__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return (__m128)((__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return (__m128d)((__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return (__m128d)((__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return (__m128d)((__m128i)a); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet2ul 
preinterpret(const Packet2d& a) { + return (__m128i)a; +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return (__m128i)a; +} + +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + Packet2d tmp = __lsx_vfcvtl_d_s(a); + return __lsx_vftint_l_d(tmp); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + Packet2d tmp = __lsx_vfcvtl_d_s(a); + return __lsx_vftint_lu_d(tmp); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return __lsx_vftint_w_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return __lsx_vftint_wu_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4f& a, const Packet4f& b) { + return __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4f& a, const Packet4f& b) { + return __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + Packet8s tmp1 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0); + Packet8s tmp2 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(c), __lsx_vftint_w_s(d), 0); + return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + Packet8us tmp1 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0); + Packet8us tmp2 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(c), __lsx_vftint_wu_s(d), 0); + return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16c& a) { + Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0); + Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0); + return __lsx_vffint_s_w(tmp2); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16c& 
a) { + Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0); + Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0); + return __lsx_vsllwil_d_w((__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { + Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0); + Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0); + return (Packet2ul)__lsx_vsllwil_d_w((__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16c& a) { + Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0); + return __lsx_vsllwil_w_h((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { + Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0); + return (Packet4ui)__lsx_vsllwil_w_h((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16c& a) { + return __lsx_vsllwil_h_b((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { + return (Packet8us)__lsx_vsllwil_h_b((__m128i)a, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0); + Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0); + return __lsx_vffint_s_wu(tmp2); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0); + Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0); + return __lsx_vsllwil_du_wu((__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0); + Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0); + return (Packet2l)__lsx_vsllwil_du_wu((__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0); + return __lsx_vsllwil_wu_hu((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { + Packet8us tmp1 = 
__lsx_vsllwil_hu_bu((__m128i)a, 0); + return (Packet4i)__lsx_vsllwil_wu_hu((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16uc& a) { + return __lsx_vsllwil_hu_bu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { + return (Packet8s)__lsx_vsllwil_hu_bu((__m128i)a, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8s& a) { + Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0); + return __lsx_vffint_s_w(tmp1); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8s& a) { + Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0); + return __lsx_vsllwil_d_w((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { + Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0); + return (Packet2ul)__lsx_vsllwil_d_w((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8s& a) { + return __lsx_vsllwil_w_h((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { + return (Packet4ui)__lsx_vsllwil_w_h((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8s& a, const Packet8s& b) { + return __lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8s& a, const Packet8s& b) { + return (Packet16uc)__lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8us& a) { + Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0); + return __lsx_vffint_s_wu(tmp1); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8us& a) { + Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0); + return __lsx_vsllwil_du_wu((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { + Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0); + return (Packet2l)__lsx_vsllwil_du_wu((__m128i)tmp1, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui 
pcast(const Packet8us& a) { + return __lsx_vsllwil_wu_hu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { + return (Packet4i)__lsx_vsllwil_wu_hu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8us& a, const Packet8us& b) { + return __lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { + return (Packet16c)__lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return __lsx_vffint_s_w(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4i& a) { + return __lsx_vsllwil_d_w((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { + return (Packet2ul)__lsx_vsllwil_d_w((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4i& a, const Packet4i& b) { + return __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4i& a, const Packet4i& b) { + return (Packet8us)__lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) { + Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0); + Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0); + return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) { + Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0); + Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0); + return (Packet16uc)__lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return __lsx_vffint_s_wu(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const 
Packet4ui& a) { + return __lsx_vsllwil_du_wu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { + return (Packet2l)__lsx_vsllwil_du_wu((__m128i)a, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4ui& a, const Packet4ui& b) { + return __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { + return (Packet8s)__lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0); + Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0); + return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0); + Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0); + return (Packet16c)__lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2l& a, const Packet2l& b) { + return __lsx_vffint_s_w(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2l& a, const Packet2l& b) { + return __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2l& a, const Packet2l& b) { + return (Packet4ui)__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0); + Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0); + return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> 
+EIGEN_STRONG_INLINE Packet8us pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0); + Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0); + return (Packet8us)__lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + const Packet2l& g, const Packet2l& h) { + const Packet8s abcd = pcast(a, b, c, d); + const Packet8s efgh = pcast(e, f, g, h); + return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + const Packet2l& g, const Packet2l& h) { + const Packet8us abcd = pcast(a, b, c, d); + const Packet8us efgh = pcast(e, f, g, h); + return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vffint_s_wu(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2ul& a, const Packet2ul& b) { + return __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { + return (Packet4i)__lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0); + Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0); + return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + Packet4ui 
tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0); + Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0); + return (Packet8s)__lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + const Packet8s abcd = pcast(a, b, c, d); + const Packet8s efgh = pcast(e, f, g, h); + return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + const Packet8us abcd = pcast(a, b, c, d); + const Packet8us efgh = pcast(e, f, g, h); + return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return __lsx_vfcvt_s_d(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { + return __lsx_vftint_l_d(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2d& a) { + return __lsx_vftint_lu_d(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2d& a, const Packet2d& b) { + return __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0); +} +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2d& a, const Packet2d& b) { + return __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0); +} +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + Packet4i tmp1 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0); + Packet4i tmp2 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(c), __lsx_vftint_l_d(d), 0); + return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet8us 
pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + Packet4ui tmp1 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0); + Packet4ui tmp2 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(c), __lsx_vftint_lu_d(d), 0); + return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const Packet8s abcd = pcast(a, b, c, d); + const Packet8s efgh = pcast(e, f, g, h); + return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0); +} +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const Packet8us abcd = pcast(a, b, c, d); + const Packet8us efgh = pcast(e, f, g, h); + return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { + return __lsx_vfcvtl_d_s(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { + Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0); + Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0); + return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)tmp2, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { + Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0); + Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0); + return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)tmp2, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { + Packet4i tmp = __lsx_vsllwil_w_h((__m128i)a, 0); + return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)tmp, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { + Packet4ui tmp = __lsx_vsllwil_wu_hu((__m128i)a, 0); + return 
__lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)tmp, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4i& a) { + return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)a, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4ui& a) { + return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)a, 0)); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { + return __lsx_vffint_d_l(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { + return __lsx_vffint_d_lu(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_LSX_H diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index b65c246e7..b19dd30a5 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1117,7 +1117,7 @@ struct lhs_process_one_packet { // loops on each largest micro horizontal panel of lhs // (LhsProgress x depth) for (Index i = peelStart; i < peelEnd; i += LhsProgress) { -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64 EIGEN_IF_CONSTEXPR(nr >= 8) { for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)]; @@ -1467,7 +1467,7 @@ EIGEN_DONT_INLINE void gebp_kernel= 8) { for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) { @@ -1935,7 +1935,7 @@ EIGEN_DONT_INLINE void gebp_kernel= 8) { for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) { @@ -2326,7 +2326,7 @@ EIGEN_DONT_INLINE void gebp_kernel= 8) { // loop on each panel of the rhs for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { @@ -2852,7 +2852,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs= 8) { for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { // skip what we have before @@ -3035,7 +3035,7 @@ struct gemm_pack_rhs= 4 ? 
(cols / 4) * 4 : 0; Index count = 0; -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64 EIGEN_IF_CONSTEXPR(nr >= 8) { for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { // skip what we have before diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 223d64f7e..2de9892ce 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -100,8 +100,8 @@ // certain common platform (compiler+architecture combinations) to avoid these problems. // Only static alignment is really problematic (relies on nonstandard compiler extensions), // try to keep heap alignment even when we have to disable static alignment. -#if EIGEN_COMP_GNUC && \ - !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) +#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \ + EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 @@ -430,6 +430,12 @@ extern "C" { #include #endif +#elif (defined __loongarch64 && defined __loongarch_sx) + +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_LSX +#include + #elif defined __HVX__ && (__HVX_LENGTH__ == 128) #define EIGEN_VECTORIZE @@ -520,6 +526,8 @@ inline static const char *SimdInstructionSetsInUse(void) { return "S390X ZVECTOR"; #elif defined(EIGEN_VECTORIZE_MSA) return "MIPS MSA"; +#elif defined(EIGEN_VECTORIZE_LSX) + return "LOONGARCH64 LSX"; #else return "None"; #endif diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 4f0b273ce..fcc2db822 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -474,6 +474,7 @@ enum Type { MSA = 0x5, SVE = 0x6, HVX = 0x7, + LSX = 0x8, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined 
EIGEN_VECTORIZE_ALTIVEC @@ -488,6 +489,8 @@ enum Type { Target = MSA #elif defined EIGEN_VECTORIZE_HVX Target = HVX +#elif defined EIGEN_VECTORIZE_LSX + Target = LSX #else Target = Generic #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index fb5605164..1116e5ee4 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -376,6 +376,13 @@ #define EIGEN_ARCH_MIPS 0 #endif +/// \internal EIGEN_ARCH_LOONGARCH64 set to 1 if the architecture is LOONGARCH64 +#if defined(__loongarch64) +#define EIGEN_ARCH_LOONGARCH64 1 +#else +#define EIGEN_ARCH_LOONGARCH64 0 +#endif + /// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC #if defined(__sparc__) || defined(__sparc) #define EIGEN_ARCH_SPARC 1 diff --git a/ci/build.linux.gitlab-ci.yml b/ci/build.linux.gitlab-ci.yml index bfaccb8c3..815bc74a4 100644 --- a/ci/build.linux.gitlab-ci.yml +++ b/ci/build.linux.gitlab-ci.yml @@ -296,6 +296,30 @@ build:linux:cross:ppc64le:clang-12:default: EIGEN_CI_CXX_COMPILER: clang++-12 EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu clang-12 +######## loongarch64 ################################################# + +.build:linux:cross:loongarch64: + extends: .build:linux:cross + variables: + EIGEN_CI_TARGET_ARCH: loongarch64 + EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu + tags: + - eigen-runner + - linux + - x86-64 + +# GCC-14 (minimum on Ubuntu 24) +build:linux:cross:loongarch64:gcc-14:default: + extends: .build:linux:cross:loongarch64 + image: ubuntu:24.04 + variables: + EIGEN_CI_C_COMPILER: gcc-14 + EIGEN_CI_CXX_COMPILER: g++-14 + EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu gcc-14-loongarch64-linux-gnu + EIGEN_CI_CROSS_C_COMPILER: loongarch64-linux-gnu-gcc-14 + EIGEN_CI_CROSS_CXX_COMPILER: loongarch64-linux-gnu-g++-14 + EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_LSX=on" + ######## MR Smoke Tests ######################################################## build:linux:cross:x86-64:gcc-10:default:smoketest: diff 
--git a/ci/test.linux.gitlab-ci.yml b/ci/test.linux.gitlab-ci.yml index 82a88af6f..4cd38d3dc 100644 --- a/ci/test.linux.gitlab-ci.yml +++ b/ci/test.linux.gitlab-ci.yml @@ -415,6 +415,37 @@ test:linux:ppc64le:clang-12:default:unsupported: variables: EIGEN_CI_TEST_LABEL: Unsupported +##### loongarch64 ################################################################### +.test:linux:loongarch64: + extends: .test:linux + variables: + EIGEN_CI_TARGET_ARCH: loongarch64 + EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu + # Install QEMU and set up the execution environment in the image + EIGEN_CI_BEFORE_SCRIPT: "apt-get update && apt-get install g++-14-loongarch64-linux-gnu gcc-14-loongarch64-linux-gnu qemu-user-static -y && \ + ln -sf /usr/loongarch64-linux-gnu/lib64/ld-linux-loongarch-lp64d.so.1 /lib64/ld-linux-loongarch-lp64d.so.1 && \ + export LD_LIBRARY_PATH=/usr/loongarch64-linux-gnu/lib:$LD_LIBRARY_PATH" + tags: + - eigen-runner + - linux + - x86-64 + +# GCC-14 (Ubuntu 24) +.test:linux:loongarch64:gcc-14:default: + extends: .test:linux:loongarch64 + image: ubuntu:24.04 + needs: [ build:linux:cross:loongarch64:gcc-14:default ] + +test:linux:loongarch64:gcc-14:default:official: + extends: .test:linux:loongarch64:gcc-14:default + variables: + EIGEN_CI_TEST_LABEL: Official + +test:linux:loongarch64:gcc-14:default:unsupported: + extends: .test:linux:loongarch64:gcc-14:default + variables: + EIGEN_CI_TEST_LABEL: Unsupported + ##### MR Smoke Tests ########################################################### test:linux:x86-64:gcc-10:default:smoketest: diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index a1488fc47..199048236 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -367,6 +367,12 @@ macro(ei_testing_print_summary) message(STATUS "S390X ZVECTOR: Using architecture defaults") endif() + if(EIGEN_TEST_LSX) + message(STATUS "LSX: ON") + else() + message(STATUS "LSX: Using architecture defaults") + endif() + if(EIGEN_TEST_SYCL) 
if(EIGEN_SYCL_TRISYCL) message(STATUS "SYCL: ON (using triSYCL)")