Add LoongArch64 architecture LSX support (build/test).

This commit is contained in:
Pengzhou0810 2025-01-20 18:37:44 +00:00 committed by Charles Schlosser
parent c486af5ad3
commit e986838464
14 changed files with 4078 additions and 8 deletions

View File

@ -486,6 +486,12 @@ if (EIGEN_BUILD_TESTING)
message(STATUS "Enabling MSA in tests/examples")
endif()
# Opt-in switch (default OFF): when enabled, -mlsx is appended to CMAKE_CXX_FLAGS
# and a status line is printed at configure time.
option(EIGEN_TEST_LSX "Enable/Disable LSX in tests/examples" OFF)
if(EIGEN_TEST_LSX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlsx")
message(STATUS "Enabling LSX in tests/examples")
endif()
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON)
if(EIGEN_TEST_FMA)

View File

@ -234,6 +234,11 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/TypeCasting.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
#elif defined EIGEN_VECTORIZE_LSX
#include "src/Core/arch/LSX/PacketMath.h"
#include "src/Core/arch/LSX/TypeCasting.h"
#include "src/Core/arch/LSX/MathFunctions.h"
#include "src/Core/arch/LSX/Complex.h"
#elif defined EIGEN_VECTORIZE_SVE
#include "src/Core/arch/SVE/PacketMath.h"
#include "src/Core/arch/SVE/TypeCasting.h"
@ -381,6 +386,8 @@ using std::ptrdiff_t;
#include "src/Core/arch/AltiVec/MatrixProduct.h"
#elif defined EIGEN_VECTORIZE_NEON
#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
#elif defined EIGEN_VECTORIZE_LSX
#include "src/Core/arch/LSX/GeneralBlockPanelKernel.h"
#endif
#if defined(EIGEN_VECTORIZE_AVX512)

View File

@ -0,0 +1,520 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_LSX_H
#define EIGEN_COMPLEX_LSX_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
//---------- float ----------
// Packet of std::complex<float> values backed by a single Packet4f register.
// The default constructor intentionally leaves `v` uninitialized.
struct Packet2cf {
  Packet4f v;

  EIGEN_STRONG_INLINE Packet2cf() {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
};
// Capabilities advertised for vectorized std::complex<float> arithmetic.
template <>
struct packet_traits<std::complex<float> > : default_packet_traits {
  typedef Packet2cf type;
  typedef Packet2cf half;
  enum {
    // Packet layout.
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

    // Operations flagged as available.
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasNegate = 1,
    HasSqrt = 1,
    HasExp = 1,
    HasLog = 1,

    // Operations flagged as unavailable.
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping from Packet2cf to its scalar type, half packet and real view.
template <>
struct unpacket_traits<Packet2cf> {
  typedef std::complex<float> type;
  typedef Packet4f as_real;
  typedef Packet2cf half;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    // No masked load/store is provided for this packet.
    masked_load_available = false,
    masked_store_available = false
  };
};
// Lane-wise complex addition: operates directly on the four underlying floats.
template <>
EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  Packet2cf sum;
  sum.v = __lsx_vfadd_s(a.v, b.v);
  return sum;
}
// Lane-wise complex subtraction: operates directly on the four underlying floats.
template <>
EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  Packet2cf difference;
  difference.v = __lsx_vfsub_s(a.v, b.v);
  return difference;
}
// Negates both complex values by XOR-ing the sign bit of every 32-bit lane.
template <>
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
  const uint32_t sign_bits[4] = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
  Packet4i sign_mask = (Packet4i)__lsx_vld(sign_bits, 0);
  return Packet2cf((Packet4f)__lsx_vxor_v((__m128i)a.v, sign_mask));
}
// Complex conjugate: XORs the sign bit of lanes 1 and 3 only, leaving lanes 0 and 2 untouched.
template <>
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
  const uint32_t imag_sign_bits[4] = {0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u};
  Packet4i imag_sign_mask = (__m128i)__lsx_vld(imag_sign_bits, 0);
  return Packet2cf((Packet4f)__lsx_vxor_v((__m128i)a.v, imag_sign_mask));
}
// Complex product of two Packet2cf, evaluated for both complex elements at once:
//   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ai*br + ar*bi)
// NOTE(review): the lane descriptions below follow the LSX shuffle/pack semantics as
// understood from the ISA manual; confirm before relying on them.
template <>
EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
// Lane-wise products: [ar*br, ai*bi] for each complex element.
Packet4f part0_tmp = (Packet4f)__lsx_vfmul_s(a.v, b.v);
// Subtract a shuffled copy so each even lane holds ar*br - ai*bi (real part).
// Only the even lanes are consumed by the final pack.
Packet4f part0 = __lsx_vfsub_s(part0_tmp, (__m128)__lsx_vshuf4i_w(part0_tmp, 0x31));
// Shuffle 0xb1 swaps re/im inside each element of a, giving [ai*br, ar*bi].
Packet4f part1_tmp = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(a.v, 0xb1), b.v);
// Each even lane now holds ai*br + ar*bi (imaginary part).
Packet4f part1 = __lsx_vfadd_s(part1_tmp, (__m128)__lsx_vshuf4i_w(part1_tmp, 0x31));
Packet2cf res;
// Interleave the even lanes of part0 (real) and part1 (imag) into [re, im, re, im].
res.v = (Packet4f)__lsx_vpackev_w((__m128i)part1, (__m128i)part0);
return res;
}
// Bitwise helpers for Packet2cf: each one applies the matching 128-bit LSX operation
// to the raw bits of the wrapped Packet4f.

// Delegates to the Packet4f ptrue overload.
template <>
EIGEN_STRONG_INLINE Packet2cf ptrue<Packet2cf>(const Packet2cf& a) {
  Packet2cf all_set;
  all_set.v = ptrue(Packet4f(a.v));
  return all_set;
}
// Bitwise a & b.
template <>
EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  return Packet2cf((Packet4f)__lsx_vand_v((__m128i)a.v, (__m128i)b.v));
}
// Bitwise a | b.
template <>
EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  return Packet2cf((Packet4f)__lsx_vor_v((__m128i)a.v, (__m128i)b.v));
}
// Bitwise a ^ b.
template <>
EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  return Packet2cf((Packet4f)__lsx_vxor_v((__m128i)a.v, (__m128i)b.v));
}
// Bitwise and-not; note that b is passed as the first operand of __lsx_vandn_v.
template <>
EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  return Packet2cf((Packet4f)__lsx_vandn_v((__m128i)b.v, (__m128i)a.v));
}
// Aligned load of two std::complex<float>, performed as a Packet4f load starting at
// the real part of the first element.
template <>
EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(from[0])));
}
// Unaligned counterpart of pload<Packet2cf>.
template <>
EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(from[0])));
}
// Broadcasts one complex value to both slots: builds an all-real and an all-imag
// vector and interleaves their low words.
template <>
EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
  const float real_part = from.real();
  const float imag_part = from.imag();
  Packet4f all_real = {real_part, real_part, real_part, real_part};
  Packet4f all_imag = {imag_part, imag_part, imag_part, imag_part};
  Packet2cf broadcast;
  broadcast.v = (Packet4f)__lsx_vilvl_w((__m128i)all_imag, (__m128i)all_real);
  return broadcast;
}
// Loads a single complex value and duplicates it; implemented via pset1.
template <>
EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
  return pset1<Packet2cf>(from[0]);
}
// Aligned store of a Packet2cf, performed as a Packet4f store starting at the real
// part of the first destination element.
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(to[0]), Packet4f(from.v));
}
// Unaligned counterpart of pstore for Packet2cf.
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(to[0]), Packet4f(from.v));
}
// Gathers the complex values at from[0] and from[stride]: each is loaded as one
// replicated 64-bit element, then the low halves of the two loads are combined.
template <>
EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
                                                                            Index stride) {
  __m128i first = __lsx_vldrepl_d(from, 0);
  __m128i second = __lsx_vldrepl_d(from + stride, 0);
  return Packet2cf((__m128)__lsx_vilvl_d(second, first));
}
// Scatters the two complex values: 64-bit element 0 goes to to[0], element 1 to to[stride].
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
                                                                       Index stride) {
  std::complex<float>* const second_slot = to + stride;
  __lsx_vstelm_d((__m128i)from.v, to, 0, 0);
  __lsx_vstelm_d((__m128i)from.v, second_slot, 0, 1);
}
// Forwards to the compiler prefetch builtin.
template <>
EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
  const std::complex<float>* const target = addr;
  __builtin_prefetch(target);
}
// Returns the first complex value by spilling the packet to a 16-byte aligned buffer.
template <>
EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
  EIGEN_ALIGN16 std::complex<float> spilled[2];
  __lsx_vst(a.v, spilled, 0);
  return spilled[0];
}
// Reverses the packet with word shuffle selector 0x4e.
template <>
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
  return Packet2cf((Packet4f)__lsx_vshuf4i_w(a.v, 0x4e));
}
// Sum reduction: adds the packet to vec4f_movehl(a.v, a.v) and returns the first element.
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
  const Packet2cf folded(__lsx_vfadd_s(a.v, vec4f_movehl(a.v, a.v)));
  return pfirst(folded);
}
// Product reduction: complex-multiplies the packet by vec4f_movehl(a.v, a.v) and
// returns the first element.
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
  const Packet2cf moved(vec4f_movehl(a.v, a.v));
  return pfirst(pmul(a, moved));
}
// Swaps the two floats of each complex element (lane order 1, 0, 3, 2).
EIGEN_STRONG_INLINE Packet2cf pcplxflip /* <Packet2cf> */ (const Packet2cf& x) {
  Packet2cf flipped;
  flipped.v = vec4f_swizzle1(x.v, 1, 0, 3, 2);
  return flipped;
}
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
// Complex division, delegated to pdiv_complex.
template <>
EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  const Packet2cf quotient = pdiv_complex(a, b);
  return quotient;
}
// Complex logarithm, delegated to plog_complex.
template <>
EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
  const Packet2cf logarithm = plog_complex(a);
  return logarithm;
}
// All-zero packet; the argument is ignored.
template <>
EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /* a */) {
  const __m128 zeros = {0.0f, 0.0f, 0.0f, 0.0f};
  return Packet2cf(zeros);
}
// Complex multiply-add: returns a * b + c for both complex elements, built from two
// real fused multiply-adds.
// NOTE(review): lane descriptions follow the LSX pack/shuffle semantics as understood
// from the ISA manual; confirm before relying on them.
template <>
EIGEN_STRONG_INLINE Packet2cf pmadd<Packet2cf>(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
Packet2cf result, t0, t1, t2;
// pzero ignores its argument; t1 is passed only to select the Packet2cf overload.
t1 = pzero(t1);
// t0 = [ar, ar] per complex element (even lanes of a duplicated).
t0.v = (__m128)__lsx_vpackev_w((__m128i)a.v, (__m128i)a.v);
// t2 = [ar*br + cr, ar*bi + ci].
t2.v = __lsx_vfmadd_s(t0.v, b.v, c.v);
// Adds the zero packet to t2.
result.v = __lsx_vfadd_s(t2.v, t1.v);
// t1 = 0 - a = [-ar, -ai].
t1.v = __lsx_vfsub_s(t1.v, a.v);
// t1 = [-ai, ai]: odd lane of the negated copy, then odd lane of a.
t1.v = (__m128)__lsx_vpackod_w((__m128i)a.v, (__m128i)t1.v);
// t2 = [bi, br]: selector 0xb1 swaps re/im inside each element of b.
t2.v = (__m128)__lsx_vshuf4i_w((__m128i)b.v, 0xb1);
// result += [-ai*bi, ai*br], completing (ar*br - ai*bi + cr) + i*(ar*bi + ai*br + ci).
result.v = __lsx_vfmadd_s(t1.v, t2.v, result.v);
return result;
}
// Complex exponential, delegated to pexp_complex.
template <>
EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
  const Packet2cf exponential = pexp_complex(a);
  return exponential;
}
//---------- double ----------
// Packet holding one std::complex<double>, backed by a single Packet2d register.
// The default constructor intentionally leaves `v` uninitialized.
struct Packet1cd {
  Packet2d v;

  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
};
// Capabilities advertised for vectorized std::complex<double> arithmetic.
template <>
struct packet_traits<std::complex<double> > : default_packet_traits {
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    // Packet layout.
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,

    // Operations flagged as available.
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasNegate = 1,
    HasSqrt = 1,
    HasLog = 1,

    // Operations flagged as unavailable.
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping from Packet1cd to its scalar type, half packet and real view.
template <>
struct unpacket_traits<Packet1cd> {
  typedef std::complex<double> type;
  typedef Packet2d as_real;
  typedef Packet1cd half;
  enum {
    size = 1,
    alignment = Aligned16,
    vectorizable = true,
    // No masked load/store is provided for this packet.
    masked_load_available = false,
    masked_store_available = false
  };
};
// Complex addition on the two underlying doubles.
template <>
EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  Packet1cd sum;
  sum.v = __lsx_vfadd_d(a.v, b.v);
  return sum;
}
// Complex subtraction on the two underlying doubles.
template <>
EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  Packet1cd difference;
  difference.v = __lsx_vfsub_d(a.v, b.v);
  return difference;
}
// Negation, delegated to the Packet2d pnegate overload.
template <>
EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
  Packet1cd negated;
  negated.v = pnegate(Packet2d(a.v));
  return negated;
}
// Complex conjugate: XORs the sign bit of the second 64-bit lane only.
template <>
EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
  const uint64_t imag_sign_bits[2] = {0x0000000000000000u, 0x8000000000000000u};
  const __m128i imag_sign_mask = __lsx_vld(imag_sign_bits, 0);
  return Packet1cd((Packet2d)__lsx_vxor_v((__m128i)a.v, imag_sign_mask));
}
// Complex product of two Packet1cd:
//   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ai*br + ar*bi)
// NOTE(review): lane descriptions assume preverse<Packet2d> swaps the two doubles and
// that __lsx_vilvl_d places its second operand's low element first; confirm.
template <>
EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
// tmp_real = [ar*br, ai*bi].
Packet2d tmp_real = __lsx_vfmul_d(a.v, b.v);
// Lane 0 of real = ar*br - ai*bi; lane 1 is not used.
Packet2d real = __lsx_vfsub_d(tmp_real, preverse(tmp_real));
// tmp_imag = [ai*br, ar*bi].
Packet2d tmp_imag = __lsx_vfmul_d(preverse(a.v), b.v);
// Lane 0 of imag = ai*br + ar*bi; lane 1 is not used.
Packet2d imag = (__m128d)__lsx_vfadd_d((__m128d)tmp_imag, preverse(tmp_imag));
Packet1cd res;
// Combine lane 0 of real and lane 0 of imag into [re, im].
res.v = (__m128d)__lsx_vilvl_d((__m128i)imag, (__m128i)real);
return res;
}
// Bitwise helpers for Packet1cd: each one applies the matching 128-bit LSX operation
// to the raw bits of the wrapped Packet2d.

// Delegates to the Packet2d ptrue overload.
template <>
EIGEN_STRONG_INLINE Packet1cd ptrue<Packet1cd>(const Packet1cd& a) {
  Packet1cd all_set;
  all_set.v = ptrue(Packet2d(a.v));
  return all_set;
}
// Bitwise a & b.
template <>
EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  return Packet1cd((Packet2d)__lsx_vand_v((__m128i)a.v, (__m128i)b.v));
}
// Bitwise a | b.
template <>
EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  return Packet1cd((Packet2d)__lsx_vor_v((__m128i)a.v, (__m128i)b.v));
}
// Bitwise a ^ b.
template <>
EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  return Packet1cd((Packet2d)__lsx_vxor_v((__m128i)a.v, (__m128i)b.v));
}
// Bitwise and-not; note that b is passed as the first operand of __lsx_vandn_v.
template <>
EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  return Packet1cd((Packet2d)__lsx_vandn_v((__m128i)b.v, (__m128i)a.v));
}
// Aligned load of one std::complex<double>, performed as a Packet2d load.
// NOTE(review): an earlier FIXME here mentioned forcing an unaligned load as a
// temporary fix, but the code calls the aligned pload<Packet2d>; confirm which is intended.
template <>
EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  Packet1cd loaded;
  loaded.v = pload<Packet2d>((const double*)from);
  return loaded;
}
// Unaligned counterpart of pload<Packet1cd>.
template <>
EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  Packet1cd loaded;
  loaded.v = ploadu<Packet2d>((const double*)from);
  return loaded;
}
// A packet holds exactly one complex<double>, so "broadcast" is just an unaligned
// load from the address of the scalar.
template <>
EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
  const std::complex<double>* const source = &from;
  return ploadu<Packet1cd>(source);
}
// Implemented via pset1 on the pointed-to value.
template <>
EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
  return pset1<Packet1cd>(from[0]);
}
// Aligned store of one std::complex<double>, performed as a Packet2d store.
// NOTE(review): an earlier FIXME here mentioned forcing an unaligned store as a
// temporary fix, but the code calls the aligned pstore; confirm which is intended.
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
  double* const destination = (double*)to;
  EIGEN_DEBUG_ALIGNED_STORE pstore(destination, Packet2d(from.v));
}
// Unaligned counterpart of pstore for Packet1cd.
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
  double* const destination = (double*)to;
  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(destination, Packet2d(from.v));
}
// Forwards to the compiler prefetch builtin.
template <>
EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
  const std::complex<double>* const target = addr;
  __builtin_prefetch(target);
}
// Extracts the single complex value by spilling both doubles to an aligned buffer.
template <>
EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
  EIGEN_ALIGN16 double parts[2];
  __lsx_vst(a.v, parts, 0);
  const double real_part = parts[0];
  const double imag_part = parts[1];
  return std::complex<double>(real_part, imag_part);
}
// A one-element packet is its own reverse.
template <>
EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
  const Packet1cd unchanged = a;
  return unchanged;
}
// Sum over a one-element packet is that element.
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
  const std::complex<double> only = pfirst(a);
  return only;
}
// Product over a one-element packet is that element.
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
  const std::complex<double> only = pfirst(a);
  return only;
}
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
// Complex division, delegated to pdiv_complex.
template <>
EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  const Packet1cd quotient = pdiv_complex(a, b);
  return quotient;
}
// Applies the Packet2d preverse to the two underlying doubles.
EIGEN_STRONG_INLINE Packet1cd pcplxflip /* <Packet1cd> */ (const Packet1cd& x) {
  Packet1cd flipped;
  flipped.v = preverse(Packet2d(x.v));
  return flipped;
}
// In-place transpose of a 2x2 block of complex<float> values.
// NOTE(review): lane descriptions follow the LSX interleave/shuffle semantics as
// understood from the ISA manual; confirm before relying on them.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
// Interleave the low words of both rows: [p0[0], p1[0], p0[1], p1[1]].
Packet4f tmp1 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v);
// Interleave the high words of both rows: [p0[2], p1[2], p0[3], p1[3]].
Packet4f tmp2 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v);
// Selector 0xd8 reorders lanes to (0, 2, 1, 3), regrouping each (re, im) pair:
// row 0 becomes [first complex of p0, first complex of p1].
kernel.packet[0].v = (Packet4f)__lsx_vshuf4i_w(tmp1, 0xd8);
// Row 1 becomes [second complex of p0, second complex of p1].
kernel.packet[1].v = (Packet4f)__lsx_vshuf4i_w(tmp2, 0xd8);
}
// Complex equality: ANDs the per-float comparison mask with a copy whose re/im lanes
// are swapped within each element.
template <>
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
  Packet4f lane_eq = (Packet4f)__lsx_vfcmp_ceq_s(a.v, b.v);
  Packet4f swapped_eq = vec4f_swizzle1(lane_eq, 1, 0, 3, 2);
  return Packet2cf(pand<Packet4f>(lane_eq, swapped_eq));
}
// Complex equality for the double packet: ANDs the per-double mask with its preverse.
template <>
EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
  Packet2d lane_eq = (Packet2d)__lsx_vfcmp_ceq_d(a.v, b.v);
  Packet2d swapped_eq = preverse(lane_eq);
  return Packet1cd(pand<Packet2d>(lane_eq, swapped_eq));
}
// Bitwise select between a and b under mask, using __lsx_vbitsel_v with operands
// (b, a, mask).
template <>
EIGEN_DEVICE_FUNC inline Packet2cf pselect(const Packet2cf& mask, const Packet2cf& a, const Packet2cf& b) {
  return Packet2cf((Packet4f)__lsx_vbitsel_v((__m128i)b.v, (__m128i)a.v, (__m128i)mask.v));
}
// Complex square root for Packet1cd, delegated to psqrt_complex.
template <>
EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
  const Packet1cd root = psqrt_complex<Packet1cd>(a);
  return root;
}
// Complex square root for Packet2cf, delegated to psqrt_complex.
template <>
EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
  const Packet2cf root = psqrt_complex<Packet2cf>(a);
  return root;
}
// Complex logarithm for Packet1cd, delegated to plog_complex.
template <>
EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
  const Packet1cd logarithm = plog_complex(a);
  return logarithm;
}
// All-zero Packet1cd; the argument is ignored.
template <>
EIGEN_STRONG_INLINE Packet1cd pzero<Packet1cd>(const Packet1cd& /* a */) {
  const __m128d zeros = {0.0, 0.0};
  return Packet1cd(zeros);
}
// Complex multiply-add: returns a * b + c, built from two real fused multiply-adds.
// NOTE(review): lane descriptions follow the LSX pack/shuffle semantics as understood
// from the ISA manual; confirm before relying on them.
template <>
EIGEN_STRONG_INLINE Packet1cd pmadd<Packet1cd>(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
Packet1cd result, t0, t1, t2;
// pzero ignores its argument; t1 is passed only to select the Packet1cd overload.
t1 = pzero(t1);
// t0 = [ar, ar].
t0.v = (__m128d)__lsx_vpackev_d((__m128i)a.v, (__m128i)a.v);
// t2 = [ar*br + cr, ar*bi + ci].
t2.v = __lsx_vfmadd_d(t0.v, b.v, c.v);
// Adds the zero packet to t2.
result.v = __lsx_vfadd_d(t2.v, t1.v);
// t1 = 0 - a = [-ar, -ai].
t1.v = __lsx_vfsub_d(t1.v, a.v);
// t1 = [-ai, ai].
t1.v = (__m128d)__lsx_vpackod_d((__m128i)a.v, (__m128i)t1.v);
// t2 = [bi, br]: selector 0xb picks both result lanes from b, swapped; the old
// contents of t2 are passed as the other source but presumably not selected.
t2.v = (__m128d)__lsx_vshuf4i_d((__m128i)t2.v, (__m128i)b.v, 0xb);
// result += [-ai*bi, ai*br].
result.v = __lsx_vfmadd_d(t1.v, t2.v, result.v);
return result;
}
// With a single element per packet the stride is irrelevant: gather is a plain
// 128-bit load from `from`.
template <>
EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
                                                                             Index /* stride */) {
  const __m128i raw = __lsx_vld((void*)from, 0);
  return Packet1cd((__m128d)raw);
}
// With a single element per packet the stride is irrelevant: scatter is a plain
// 128-bit store to `to`.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
                                                                        Index /* stride */) {
  const __m128i raw = (__m128i)from.v;
  __lsx_vst(raw, (void*)to, 0);
}
// Rearranges the 2x2 block of doubles held by two Packet1cd: packet[0] receives the
// low-interleave of the two inputs and packet[1] the high-interleave.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
  // Snapshot both rows first so neither write can affect the other read.
  const __m128i row0 = (__m128i)kernel.packet[0].v;
  const __m128i row1 = (__m128i)kernel.packet[1].v;
  kernel.packet[0].v = (__m128d)__lsx_vilvl_d(row1, row0);
  kernel.packet[1].v = (__m128d)__lsx_vilvh_d(row1, row0);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_LSX_H

View File

@ -0,0 +1,23 @@
// Include guard added for consistency with the other LSX headers (Complex.h,
// MathFunctions.h, TypeCasting.h): this header contains explicit specializations,
// which would be redefinition errors if it were ever included twice.
#ifndef EIGEN_GENERAL_BLOCK_PANEL_KERNEL_LSX_H
#define EIGEN_GENERAL_BLOCK_PANEL_KERNEL_LSX_H

// IWYU pragma: private
#include "../../InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

// Value used for `nr` in the LSX GEBP traits below. It can be overridden by defining
// EIGEN_LSX_GEBP_NR before this header is included.
#ifndef EIGEN_LSX_GEBP_NR
#define EIGEN_LSX_GEBP_NR 8
#endif

// float x float GEBP traits for LSX: inherits the generic traits and overrides `nr`.
template <>
struct gebp_traits<float, float, false, false, Architecture::LSX, GEBPPacketFull>
    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
  enum { nr = EIGEN_LSX_GEBP_NR };
};

// double x double GEBP traits for LSX: inherits the generic traits and overrides `nr`.
template <>
struct gebp_traits<double, double, false, false, Architecture::LSX, GEBPPacketFull>
    : gebp_traits<double, double, false, false, Architecture::Generic, GEBPPacketFull> {
  enum { nr = EIGEN_LSX_GEBP_NR };
};

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_GENERAL_BLOCK_PANEL_KERNEL_LSX_H

View File

@ -0,0 +1,43 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2024 XiWei Gu (guxiwei-hf@loongson.cn)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_LSX_H
#define EIGEN_MATH_FUNCTIONS_LSX_H
/* The sin and cos functions of this file are loosely derived from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
// Each line below invokes a macro with a function name and an LSX packet type.
// NOTE(review): the macros are defined outside this file; presumably each one emits
// the p<name> specialization for that packet by forwarding to a generic
// implementation. Confirm against the macro definitions.

// Double-precision packet functions.
EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d)
EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d)
EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d)
EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d)
// Single-precision packet functions.
EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f)
EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f)
EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f)
EIGEN_FLOAT_PACKET_FUNCTION(tanh, Packet4f)
// Functions instantiated through the generic macro for both packet types.
EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d)
EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f)
EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d)
EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f)
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_LSX_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,526 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_LSX_H
#define EIGEN_TYPE_CASTING_LSX_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
//==============================================================================
// preinterpret
//==============================================================================
// Bit-preserving reinterpretations between LSX packet types. Each specialization first
// obtains the raw 128-bit integer view of its argument and then casts to the target
// representation; no lane value is modified.

// ---- to Packet4f ----
template <>
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
  const __m128i bits = (__m128i)a;
  return (__m128)bits;
}
template <>
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
  const __m128i bits = (__m128i)a;
  return (__m128)bits;
}
// ---- to Packet2d ----
template <>
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
  const __m128i bits = (__m128i)a;
  return (__m128d)bits;
}
template <>
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
  const __m128i bits = (__m128i)a;
  return (__m128d)bits;
}
template <>
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
  const __m128i bits = (__m128i)a;
  return (__m128d)bits;
}
// ---- to signed integer packets ----
template <>
EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
// ---- to unsigned integer packets ----
template <>
EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
  const __m128i bits = (__m128i)a;
  return bits;
}
// Casts from Packet4f to integer packets.
//
// Two fixes relative to the previous implementation:
//  * Float-to-integer conversion now uses the round-toward-zero instructions
//    (__lsx_vftintrz_*). The plain __lsx_vftint_* forms round according to the current
//    FCSR rounding mode (round-to-nearest by default), so e.g. 2.7f became 3 instead of
//    2 as static_cast<int>(2.7f) requires.
//  * Multi-packet narrowing now uses __lsx_vpickev_*, which keeps the low half of every
//    lane and places its SECOND operand in the low half of the result. The previous
//    __lsx_vssrlni_*(x(a), x(b), 0) form put b's lanes before a's and saturated after an
//    unsigned shift, which turned every negative value into the positive maximum.
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
  // Widen the two low floats to double, then truncate to int64.
  Packet2d tmp = __lsx_vfcvtl_d_s(a);
  return __lsx_vftintrz_l_d(tmp);
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
  // Widen the two low floats to double, then truncate to uint64.
  Packet2d tmp = __lsx_vfcvtl_d_s(a);
  return __lsx_vftintrz_lu_d(tmp);
}
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
  return __lsx_vftintrz_w_s(a);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
  return __lsx_vftintrz_wu_s(a);
}
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {
  // Result lanes: [a0..a3, b0..b3].
  return __lsx_vpickev_h(__lsx_vftintrz_w_s(b), __lsx_vftintrz_w_s(a));
}
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {
  // Result lanes: [a0..a3, b0..b3].
  return __lsx_vpickev_h(__lsx_vftintrz_wu_s(b), __lsx_vftintrz_wu_s(a));
}
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
                                                         const Packet4f& d) {
  // Narrow 32 -> 16 bits pairwise, then 16 -> 8 bits; result lanes: [a, b, c, d].
  const __m128i ab = __lsx_vpickev_h(__lsx_vftintrz_w_s(b), __lsx_vftintrz_w_s(a));
  const __m128i cd = __lsx_vpickev_h(__lsx_vftintrz_w_s(d), __lsx_vftintrz_w_s(c));
  return __lsx_vpickev_b(cd, ab);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
                                                           const Packet4f& d) {
  // Narrow 32 -> 16 bits pairwise, then 16 -> 8 bits; result lanes: [a, b, c, d].
  const __m128i ab = __lsx_vpickev_h(__lsx_vftintrz_wu_s(b), __lsx_vftintrz_wu_s(a));
  const __m128i cd = __lsx_vpickev_h(__lsx_vftintrz_wu_s(d), __lsx_vftintrz_wu_s(c));
  return __lsx_vpickev_b(cd, ab);
}
// Widening casts from Packet16c. Each __lsx_vsllwil_* step is called with shift 0;
// the steps are chained until the destination lane width is reached.
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet16c, Packet4f>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  const __m128i as_i32 = __lsx_vsllwil_w_h(as_i16, 0);
  return __lsx_vffint_s_w(as_i32);
}
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet16c, Packet2l>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  const __m128i as_i32 = __lsx_vsllwil_w_h(as_i16, 0);
  return __lsx_vsllwil_d_w(as_i32, 0);
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  const __m128i as_i32 = __lsx_vsllwil_w_h(as_i16, 0);
  return Packet2ul(__lsx_vsllwil_d_w(as_i32, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet16c, Packet4i>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  return __lsx_vsllwil_w_h(as_i16, 0);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  return Packet4ui(__lsx_vsllwil_w_h(as_i16, 0));
}
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet16c, Packet8s>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  return as_i16;
}
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  return Packet8us(as_i16);
}
// Widening casts from Packet16uc, using the unsigned __lsx_vsllwil_*u_*u steps with
// shift 0, chained until the destination lane width is reached.
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet16uc, Packet4f>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  const __m128i as_u32 = __lsx_vsllwil_wu_hu(as_u16, 0);
  return __lsx_vffint_s_wu(as_u32);
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet16uc, Packet2ul>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  const __m128i as_u32 = __lsx_vsllwil_wu_hu(as_u16, 0);
  return __lsx_vsllwil_du_wu(as_u32, 0);
}
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  const __m128i as_u32 = __lsx_vsllwil_wu_hu(as_u16, 0);
  return Packet2l(__lsx_vsllwil_du_wu(as_u32, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet16uc, Packet4ui>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  return __lsx_vsllwil_wu_hu(as_u16, 0);
}
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  return Packet4i(__lsx_vsllwil_wu_hu(as_u16, 0));
}
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet16uc, Packet8us>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  return as_u16;
}
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  return Packet8s(as_u16);
}
// Widening casts from Packet8s, using the signed __lsx_vsllwil_* steps with shift 0.
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet8s, Packet4f>(const Packet8s& a) {
  const __m128i as_i32 = __lsx_vsllwil_w_h((__m128i)a, 0);
  return __lsx_vffint_s_w(as_i32);
}
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet8s, Packet2l>(const Packet8s& a) {
  const __m128i as_i32 = __lsx_vsllwil_w_h((__m128i)a, 0);
  return __lsx_vsllwil_d_w(as_i32, 0);
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
  const __m128i as_i32 = __lsx_vsllwil_w_h((__m128i)a, 0);
  return Packet2ul(__lsx_vsllwil_d_w(as_i32, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet8s, Packet4i>(const Packet8s& a) {
  const __m128i as_i32 = __lsx_vsllwil_w_h((__m128i)a, 0);
  return as_i32;
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
  const __m128i as_i32 = __lsx_vsllwil_w_h((__m128i)a, 0);
  return Packet4ui(as_i32);
}
// Narrowing casts from two Packet8s to one 8-bit packet.
//
// The result must hold a's lanes first, then b's, each reduced like static_cast (low
// 8 bits kept). The previous __lsx_vssrlni_b_h(a, b, 0) placed b's lanes in the low
// half and saturated after an unsigned shift, so negative inputs became 0x7f.
// __lsx_vpickev_b keeps the low byte of every 16-bit lane and places its SECOND
// operand in the low half of the result.
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
  return __lsx_vpickev_b((__m128i)b, (__m128i)a);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
  return (Packet16uc)__lsx_vpickev_b((__m128i)b, (__m128i)a);
}
// Widening casts from Packet8us, using the unsigned __lsx_vsllwil_*u_*u steps with shift 0.
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet8us, Packet4f>(const Packet8us& a) {
  const __m128i as_u32 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
  return __lsx_vffint_s_wu(as_u32);
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet8us, Packet2ul>(const Packet8us& a) {
  const __m128i as_u32 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
  return __lsx_vsllwil_du_wu(as_u32, 0);
}
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
  const __m128i as_u32 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
  return Packet2l(__lsx_vsllwil_du_wu(as_u32, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet8us, Packet4ui>(const Packet8us& a) {
  const __m128i as_u32 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
  return as_u32;
}
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
  const __m128i as_u32 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
  return Packet4i(as_u32);
}
// Narrowing casts from two Packet8us to one 8-bit packet.
//
// The result must hold a's lanes first, then b's, each reduced like static_cast (low
// 8 bits kept). The previous __lsx_vssrlni_bu_h(a, b, 0) placed b's lanes in the low
// half and saturated values above 255 instead of wrapping. __lsx_vpickev_b keeps the
// low byte of every 16-bit lane and places its SECOND operand in the low half.
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
  return __lsx_vpickev_b((__m128i)b, (__m128i)a);
}
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
  return (Packet16c)__lsx_vpickev_b((__m128i)b, (__m128i)a);
}
// Same-width and widening casts from Packet4i.
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
  const Packet4f as_float = __lsx_vffint_s_w(a);
  return as_float;
}
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet4i, Packet2l>(const Packet4i& a) {
  const __m128i as_i64 = __lsx_vsllwil_d_w((__m128i)a, 0);
  return as_i64;
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
  const __m128i as_i64 = __lsx_vsllwil_d_w((__m128i)a, 0);
  return Packet2ul(as_i64);
}
// Narrowing casts from Packet4i.
//
// The result must hold the lanes of a, then b (then c, d), each reduced like
// static_cast (low bits kept). The previous __lsx_vssrlni_*(a, b, 0) calls placed b's
// lanes in the low half and saturated after an unsigned shift, so every negative input
// became the positive maximum. __lsx_vpickev_* keeps the low half of every lane and
// places its SECOND operand in the low half of the result.
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
  return __lsx_vpickev_h((__m128i)b, (__m128i)a);
}
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
  return (Packet8us)__lsx_vpickev_h((__m128i)b, (__m128i)a);
}
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
                                                         const Packet4i& d) {
  // 32 -> 16 bits pairwise, then 16 -> 8 bits; result lanes: [a, b, c, d].
  const __m128i ab = __lsx_vpickev_h((__m128i)b, (__m128i)a);
  const __m128i cd = __lsx_vpickev_h((__m128i)d, (__m128i)c);
  return __lsx_vpickev_b(cd, ab);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
                                                           const Packet4i& d) {
  // 32 -> 16 bits pairwise, then 16 -> 8 bits; result lanes: [a, b, c, d].
  const __m128i ab = __lsx_vpickev_h((__m128i)b, (__m128i)a);
  const __m128i cd = __lsx_vpickev_h((__m128i)d, (__m128i)c);
  return (Packet16uc)__lsx_vpickev_b(cd, ab);
}
// Same-width and widening casts from Packet4ui.
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
  const Packet4f as_float = __lsx_vffint_s_wu(a);
  return as_float;
}
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet4ui, Packet2ul>(const Packet4ui& a) {
  const __m128i as_u64 = __lsx_vsllwil_du_wu((__m128i)a, 0);
  return as_u64;
}
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
  const __m128i as_u64 = __lsx_vsllwil_du_wu((__m128i)a, 0);
  return Packet2l(as_u64);
}
// Narrowing casts from Packet4ui.
//
// The result must hold the lanes of a, then b (then c, d), each reduced like
// static_cast (low bits kept). The previous __lsx_vssrlni_*u_*(a, b, 0) calls placed
// b's lanes in the low half and saturated out-of-range values instead of wrapping.
// __lsx_vpickev_* keeps the low half of every lane and places its SECOND operand in
// the low half of the result.
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
  return __lsx_vpickev_h((__m128i)b, (__m128i)a);
}
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
  return (Packet8s)__lsx_vpickev_h((__m128i)b, (__m128i)a);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
                                                            const Packet4ui& d) {
  // 32 -> 16 bits pairwise, then 16 -> 8 bits; result lanes: [a, b, c, d].
  const __m128i ab = __lsx_vpickev_h((__m128i)b, (__m128i)a);
  const __m128i cd = __lsx_vpickev_h((__m128i)d, (__m128i)c);
  return __lsx_vpickev_b(cd, ab);
}
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
                                                          const Packet4ui& d) {
  // 32 -> 16 bits pairwise, then 16 -> 8 bits; result lanes: [a, b, c, d].
  const __m128i ab = __lsx_vpickev_h((__m128i)b, (__m128i)a);
  const __m128i cd = __lsx_vpickev_h((__m128i)d, (__m128i)c);
  return (Packet16c)__lsx_vpickev_b(cd, ab);
}
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
return __lsx_vffint_s_w(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
return __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
return (Packet4ui)__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
}
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
const Packet2l& d) {
Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
}
// Packs four Packet2l (a, b, c, d) into one Packet8us.
// Uses the same two narrowing steps as the Packet8s cast (vssrlni.w.d, then vssrlni.h.w, shift
// amount 0) and reinterprets the 16-bit lanes as unsigned.
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
                                                         const Packet2l& d) {
  const __m128i ab_i32 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
  const __m128i cd_i32 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
  const __m128i packed_i16 = __lsx_vssrlni_h_w(ab_i32, cd_i32, 0);
  return (Packet8us)packed_i16;
}
// Packs eight Packet2l (a..h) into one Packet16c.
// Each group of four inputs is first reduced to a Packet8s through pcast<Packet2l, Packet8s>;
// the two 16-bit packets are then narrowed to bytes with vssrlni.b.h (shift amount 0).
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
                                                         const Packet2l& d, const Packet2l& e, const Packet2l& f,
                                                         const Packet2l& g, const Packet2l& h) {
  const Packet8s abcd_i16 = pcast<Packet2l, Packet8s>(a, b, c, d);
  const Packet8s efgh_i16 = pcast<Packet2l, Packet8s>(e, f, g, h);
  const __m128i packed_i8 = __lsx_vssrlni_b_h((__m128i)abcd_i16, (__m128i)efgh_i16, 0);
  return packed_i8;
}
// Packs eight Packet2l (a..h) into one Packet16uc.
// Each group of four inputs is first reduced to a Packet8us through pcast<Packet2l, Packet8us>;
// the two 16-bit packets are then narrowed to bytes with the unsigned-saturating vssrlni.bu.h
// (shift amount 0).
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
                                                           const Packet2l& d, const Packet2l& e, const Packet2l& f,
                                                           const Packet2l& g, const Packet2l& h) {
  const Packet8us abcd_u16 = pcast<Packet2l, Packet8us>(a, b, c, d);
  const Packet8us efgh_u16 = pcast<Packet2l, Packet8us>(e, f, g, h);
  const __m128i packed_u8 = __lsx_vssrlni_bu_h((__m128i)abcd_u16, (__m128i)efgh_u16, 0);
  return packed_u8;
}
// Converts two Packet2ul (a, b) to one Packet4f.
// The 64-bit lanes are narrowed to unsigned 32 bits and then converted with the unsigned
// int->float intrinsic (vffint.s.wu). The narrowing has to be the unsigned-saturating form
// (vssrlni.wu.d), like the other Packet2ul casts: the signed form (vssrlni.w.d) clamps at
// INT32_MAX and would therefore corrupt every value in [2^31, 2^32).
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
  const __m128i as_u32 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
  return __lsx_vffint_s_wu(as_u32);
}
// Packs two Packet2ul (a, b) into one Packet4ui with the unsigned-saturating narrowing
// intrinsic vssrlni.wu.d (shift amount 0).
// NOTE(review): confirm which operand ends up in the low lanes of the result.
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
  const __m128i narrowed = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
  return narrowed;
}
// Packs two Packet2ul (a, b) into one Packet4i: unsigned-saturating narrowing with
// vssrlni.wu.d (shift amount 0), then the 32-bit lanes are reinterpreted as signed.
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
  const __m128i narrowed = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
  return (Packet4i)narrowed;
}
// Packs four Packet2ul (a, b, c, d) into one Packet8us in two unsigned-saturating steps:
// 64 -> 32 bit with vssrlni.wu.d, then 32 -> 16 bit with vssrlni.hu.w (shift amount 0 in both).
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                          const Packet2ul& d) {
  const __m128i ab_u32 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
  const __m128i cd_u32 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
  return __lsx_vssrlni_hu_w(ab_u32, cd_u32, 0);
}
// Packs four Packet2ul (a, b, c, d) into one Packet8s.
// The narrowing stays in the unsigned domain (vssrlni.wu.d, then vssrlni.hu.w, shift amount 0);
// only the final 16-bit lanes are reinterpreted as signed.
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                        const Packet2ul& d) {
  const __m128i ab_u32 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
  const __m128i cd_u32 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
  const __m128i packed_u16 = __lsx_vssrlni_hu_w(ab_u32, cd_u32, 0);
  return (Packet8s)packed_u16;
}
// Packs eight Packet2ul (a..h) into one Packet16uc.
// Each group of four inputs is reduced to a Packet8us through pcast<Packet2ul, Packet8us>, and
// the two 16-bit packets are narrowed to bytes with the unsigned-saturating vssrlni.bu.h
// (shift amount 0). This mirrors pcast<Packet4ui, Packet16uc>; the signed-saturating
// vssrlni.b.h must not be used for an unsigned 8-bit target because it clamps at 127.
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                            const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
                                                            const Packet2ul& g, const Packet2ul& h) {
  const Packet8us abcd = pcast<Packet2ul, Packet8us>(a, b, c, d);
  const Packet8us efgh = pcast<Packet2ul, Packet8us>(e, f, g, h);
  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
}
// Packs eight Packet2ul (a..h) into one Packet16c.
// Each group of four inputs is reduced to a Packet8us through pcast<Packet2ul, Packet8us>; the
// two 16-bit packets are narrowed with the unsigned-saturating vssrlni.bu.h (shift amount 0)
// and the resulting bytes are handed back as signed chars, as pcast<Packet4ui, Packet16c> does.
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
                                                          const Packet2ul& g, const Packet2ul& h) {
  const Packet8us abcd_u16 = pcast<Packet2ul, Packet8us>(a, b, c, d);
  const Packet8us efgh_u16 = pcast<Packet2ul, Packet8us>(e, f, g, h);
  const __m128i packed_u8 = __lsx_vssrlni_bu_h((__m128i)abcd_u16, (__m128i)efgh_u16, 0);
  return packed_u8;
}
// Converts two Packet2d (a, b) to one Packet4f with vfcvt.s.d.
// Note that the operands are deliberately handed over as (b, a).
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
  const Packet4f as_float = __lsx_vfcvt_s_d(b, a);
  return as_float;
}
// Converts each double lane of `a` to a signed 64-bit integer.
// The round-toward-zero variant (vftintrz) is used so that the result matches a scalar
// static_cast<int64_t>(double); plain vftint rounds according to the current floating-point
// rounding mode (round-to-nearest by default) and would e.g. turn 2.7 into 3.
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
  return __lsx_vftintrz_l_d(a);
}
// Converts each double lane of `a` to an unsigned 64-bit integer.
// The round-toward-zero variant (vftintrz) is used so that the result matches a scalar
// static_cast<uint64_t>(double); plain vftint rounds according to the current floating-point
// rounding mode instead of truncating.
template <>
EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
  return __lsx_vftintrz_lu_d(a);
}
// Converts two Packet2d (a, b) to one Packet4i.
// Each input is first truncated toward zero to signed 64-bit integers (vftintrz, matching the
// semantics of a scalar static_cast; plain vftint would honour the current rounding mode), then
// the two results are narrowed to 32 bits with vssrlni.w.d (shift amount 0).
// NOTE(review): the narrowing is a logical shift with saturation; confirm negative values and
// the operand/lane order behave as intended.
template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
  const __m128i a_i64 = __lsx_vftintrz_l_d(a);
  const __m128i b_i64 = __lsx_vftintrz_l_d(b);
  return __lsx_vssrlni_w_d(a_i64, b_i64, 0);
}
// Converts two Packet2d (a, b) to one Packet4ui.
// Each input is first truncated toward zero to unsigned 64-bit integers (vftintrz, matching the
// semantics of a scalar static_cast; plain vftint would honour the current rounding mode), then
// the two results are narrowed with the unsigned-saturating vssrlni.wu.d (shift amount 0).
template <>
EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
  const __m128i a_u64 = __lsx_vftintrz_lu_d(a);
  const __m128i b_u64 = __lsx_vftintrz_lu_d(b);
  return __lsx_vssrlni_wu_d(a_u64, b_u64, 0);
}
// Converts four Packet2d (a, b, c, d) to one Packet8s.
// Every input is truncated toward zero to signed 64-bit integers (vftintrz, so the result
// agrees with a scalar static_cast instead of depending on the current rounding mode), then
// narrowed 64 -> 32 bit with vssrlni.w.d and 32 -> 16 bit with vssrlni.h.w (shift amount 0).
// NOTE(review): both narrowing steps are logical shifts; verify the behaviour for negatives.
template <>
EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
                                                       const Packet2d& d) {
  const __m128i ab_i32 = __lsx_vssrlni_w_d(__lsx_vftintrz_l_d(a), __lsx_vftintrz_l_d(b), 0);
  const __m128i cd_i32 = __lsx_vssrlni_w_d(__lsx_vftintrz_l_d(c), __lsx_vftintrz_l_d(d), 0);
  return __lsx_vssrlni_h_w(ab_i32, cd_i32, 0);
}
// Converts four Packet2d (a, b, c, d) to one Packet8us.
// Every input is truncated toward zero to unsigned 64-bit integers (vftintrz, so the result
// agrees with a scalar static_cast instead of depending on the current rounding mode), then
// narrowed 64 -> 32 bit with vssrlni.wu.d and 32 -> 16 bit with vssrlni.hu.w (shift amount 0).
template <>
EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
                                                         const Packet2d& d) {
  const __m128i ab_u32 = __lsx_vssrlni_wu_d(__lsx_vftintrz_lu_d(a), __lsx_vftintrz_lu_d(b), 0);
  const __m128i cd_u32 = __lsx_vssrlni_wu_d(__lsx_vftintrz_lu_d(c), __lsx_vftintrz_lu_d(d), 0);
  return __lsx_vssrlni_hu_w(ab_u32, cd_u32, 0);
}
// Converts eight Packet2d (a..h) to one Packet16c.
// Each group of four inputs is reduced to a Packet8s through pcast<Packet2d, Packet8s>; the
// two 16-bit packets are then narrowed to bytes with vssrlni.b.h (shift amount 0).
template <>
EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
                                                         const Packet2d& d, const Packet2d& e, const Packet2d& f,
                                                         const Packet2d& g, const Packet2d& h) {
  const Packet8s abcd_i16 = pcast<Packet2d, Packet8s>(a, b, c, d);
  const Packet8s efgh_i16 = pcast<Packet2d, Packet8s>(e, f, g, h);
  const __m128i packed_i8 = __lsx_vssrlni_b_h((__m128i)abcd_i16, (__m128i)efgh_i16, 0);
  return packed_i8;
}
// Converts eight Packet2d (a..h) to one Packet16uc.
// Each group of four inputs is reduced to a Packet8us through pcast<Packet2d, Packet8us>; the
// two 16-bit packets are then narrowed to bytes with the unsigned-saturating vssrlni.bu.h
// (shift amount 0).
template <>
EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
                                                           const Packet2d& d, const Packet2d& e, const Packet2d& f,
                                                           const Packet2d& g, const Packet2d& h) {
  const Packet8us abcd_u16 = pcast<Packet2d, Packet8us>(a, b, c, d);
  const Packet8us efgh_u16 = pcast<Packet2d, Packet8us>(e, f, g, h);
  const __m128i packed_u8 = __lsx_vssrlni_bu_h((__m128i)abcd_u16, (__m128i)efgh_u16, 0);
  return packed_u8;
}
// Widens floats from `a` to double with vfcvtl.d.s. Only two of the four float lanes fit in the
// result (the "l" form presumably selects the low pair - confirm against the LSX reference).
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
  const Packet2d widened = __lsx_vfcvtl_d_s(a);
  return widened;
}
// Converts signed bytes from `a` to double. The lanes are widened step by step with the signed
// vsllwil intrinsics (8 -> 16 -> 32 -> 64 bit, shift amount 0) and the resulting two 64-bit
// integers are converted with vffint.d.l.
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
  const __m128i as_i16 = __lsx_vsllwil_h_b((__m128i)a, 0);
  const __m128i as_i32 = __lsx_vsllwil_w_h(as_i16, 0);
  const __m128i as_i64 = __lsx_vsllwil_d_w(as_i32, 0);
  return __lsx_vffint_d_l(as_i64);
}
// Converts unsigned bytes from `a` to double. The lanes are widened step by step with the
// unsigned vsllwil intrinsics (8 -> 16 -> 32 -> 64 bit, shift amount 0) and the resulting two
// 64-bit integers are converted with vffint.d.lu.
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
  const __m128i as_u16 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
  const __m128i as_u32 = __lsx_vsllwil_wu_hu(as_u16, 0);
  const __m128i as_u64 = __lsx_vsllwil_du_wu(as_u32, 0);
  return __lsx_vffint_d_lu(as_u64);
}
// Converts signed 16-bit lanes from `a` to double: widened 16 -> 32 -> 64 bit with the signed
// vsllwil intrinsics (shift amount 0), then converted with vffint.d.l.
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
  const __m128i as_i32 = __lsx_vsllwil_w_h((__m128i)a, 0);
  const __m128i as_i64 = __lsx_vsllwil_d_w(as_i32, 0);
  return __lsx_vffint_d_l(as_i64);
}
// Converts unsigned 16-bit lanes from `a` to double: widened 16 -> 32 -> 64 bit with the
// unsigned vsllwil intrinsics (shift amount 0), then converted with vffint.d.lu.
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
  const __m128i as_u32 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
  const __m128i as_u64 = __lsx_vsllwil_du_wu(as_u32, 0);
  return __lsx_vffint_d_lu(as_u64);
}
// Converts signed 32-bit lanes from `a` to double: widened to 64 bit with vsllwil.d.w (shift
// amount 0), then converted with vffint.d.l.
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
  const __m128i as_i64 = __lsx_vsllwil_d_w((__m128i)a, 0);
  return __lsx_vffint_d_l(as_i64);
}
// Converts unsigned 32-bit lanes from `a` to double: widened to 64 bit with vsllwil.du.wu
// (shift amount 0), then converted with vffint.d.lu.
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet4ui, Packet2d>(const Packet4ui& a) {
  const __m128i as_u64 = __lsx_vsllwil_du_wu((__m128i)a, 0);
  return __lsx_vffint_d_lu(as_u64);
}
// Lane-wise conversion of two signed 64-bit integers to double (vffint.d.l).
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
  const Packet2d as_double = __lsx_vffint_d_l(a);
  return as_double;
}
// Lane-wise conversion of two unsigned 64-bit integers to double (vffint.d.lu).
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
  const Packet2d as_double = __lsx_vffint_d_lu(a);
  return as_double;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_LSX_H

View File

@ -1117,7 +1117,7 @@ struct lhs_process_one_packet {
// loops on each largest micro horizontal panel of lhs
// (LhsProgress x depth)
for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
EIGEN_IF_CONSTEXPR(nr >= 8) {
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
@ -1467,7 +1467,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
(depth * sizeof(LhsScalar) * 3 * LhsProgress)));
for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
EIGEN_IF_CONSTEXPR(nr >= 8) {
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
@ -1935,7 +1935,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
EIGEN_IF_CONSTEXPR(nr >= 8) {
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
@ -2326,7 +2326,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
}
//---------- Process remaining rows, 1 at once ----------
if (peeled_mc_quarter < rows) {
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
EIGEN_IF_CONSTEXPR(nr >= 8) {
// loop on each panel of the rhs
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
@ -2852,7 +2852,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
Index count = 0;
const Index peeled_k = (depth / PacketSize) * PacketSize;
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
EIGEN_IF_CONSTEXPR(nr >= 8) {
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
// skip what we have before
@ -3035,7 +3035,7 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMo
Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
Index count = 0;
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
EIGEN_IF_CONSTEXPR(nr >= 8) {
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
// skip what we have before

View File

@ -100,8 +100,8 @@
// certain common platform (compiler+architecture combinations) to avoid these problems.
// Only static alignment is really problematic (relies on nonstandard compiler extensions),
// try to keep heap alignment even when we have to disable static alignment.
#if EIGEN_COMP_GNUC && \
!(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
#else
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
@ -430,6 +430,12 @@ extern "C" {
#include <msa.h>
#endif
#elif (defined __loongarch64 && defined __loongarch_sx)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_LSX
#include <lsxintrin.h>
#elif defined __HVX__ && (__HVX_LENGTH__ == 128)
#define EIGEN_VECTORIZE
@ -520,6 +526,8 @@ inline static const char *SimdInstructionSetsInUse(void) {
return "S390X ZVECTOR";
#elif defined(EIGEN_VECTORIZE_MSA)
return "MIPS MSA";
#elif defined(EIGEN_VECTORIZE_LSX)
return "LOONGARCH64 LSX";
#else
return "None";
#endif

View File

@ -474,6 +474,7 @@ enum Type {
MSA = 0x5,
SVE = 0x6,
HVX = 0x7,
LSX = 0x8,
#if defined EIGEN_VECTORIZE_SSE
Target = SSE
#elif defined EIGEN_VECTORIZE_ALTIVEC
@ -488,6 +489,8 @@ enum Type {
Target = MSA
#elif defined EIGEN_VECTORIZE_HVX
Target = HVX
#elif defined EIGEN_VECTORIZE_LSX
Target = LSX
#else
Target = Generic
#endif

View File

@ -376,6 +376,13 @@
#define EIGEN_ARCH_MIPS 0
#endif
/// \internal EIGEN_ARCH_LOONGARCH64 set to 1 if the architecture is LOONGARCH64
/// (i.e. the __loongarch64 macro is defined), and to 0 otherwise so it can be used in #if
#if defined(__loongarch64)
#define EIGEN_ARCH_LOONGARCH64 1
#else
#define EIGEN_ARCH_LOONGARCH64 0
#endif
/// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC
#if defined(__sparc__) || defined(__sparc)
#define EIGEN_ARCH_SPARC 1

View File

@ -296,6 +296,30 @@ build:linux:cross:ppc64le:clang-12:default:
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu clang-12
######## loongarch64 #################################################
# Base job for loongarch64 cross builds: extends the generic cross-build job, sets the target
# architecture / triple, and selects runners tagged eigen-runner, linux and x86-64.
.build:linux:cross:loongarch64:
extends: .build:linux:cross
variables:
EIGEN_CI_TARGET_ARCH: loongarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu
tags:
- eigen-runner
- linux
- x86-64
# GCC-14 (minimum on Ubuntu 24)
# Cross-builds for loongarch64 with the GCC 14 toolchain on ubuntu:24.04.
# LSX is switched on through the EIGEN_TEST_LSX CMake option passed in EIGEN_CI_ADDITIONAL_ARGS.
build:linux:cross:loongarch64:gcc-14:default:
extends: .build:linux:cross:loongarch64
image: ubuntu:24.04
variables:
EIGEN_CI_C_COMPILER: gcc-14
EIGEN_CI_CXX_COMPILER: g++-14
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu gcc-14-loongarch64-linux-gnu
EIGEN_CI_CROSS_C_COMPILER: loongarch64-linux-gnu-gcc-14
EIGEN_CI_CROSS_CXX_COMPILER: loongarch64-linux-gnu-g++-14
EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_LSX=on"
######## MR Smoke Tests ########################################################
build:linux:cross:x86-64:gcc-10:default:smoketest:

View File

@ -415,6 +415,37 @@ test:linux:ppc64le:clang-12:default:unsupported:
variables:
EIGEN_CI_TEST_LABEL: Unsupported
##### loongarch64 ###################################################################
# Base job for loongarch64 tests. The runners are x86-64, so the before-script installs the
# loongarch64 cross toolchain and qemu-user-static, links the loongarch64 dynamic loader into
# /lib64, and prepends the cross libraries to LD_LIBRARY_PATH.
.test:linux:loongarch64:
extends: .test:linux
variables:
EIGEN_CI_TARGET_ARCH: loongarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu
# Install QEMU and set up the execution environment in the image
EIGEN_CI_BEFORE_SCRIPT: "apt-get update && apt-get install g++-14-loongarch64-linux-gnu gcc-14-loongarch64-linux-gnu qemu-user-static -y && \
ln -sf /usr/loongarch64-linux-gnu/lib64/ld-linux-loongarch-lp64d.so.1 /lib64/ld-linux-loongarch-lp64d.so.1 && \
export LD_LIBRARY_PATH=/usr/loongarch64-linux-gnu/lib:$LD_LIBRARY_PATH"
tags:
- eigen-runner
- linux
- x86-64
# GCC-14 (Ubuntu 24)
# Shared settings for the GCC-14 loongarch64 test jobs: ubuntu:24.04 image, and a dependency on
# the matching loongarch64 GCC-14 cross-build job.
.test:linux:loongarch64:gcc-14:default:
extends: .test:linux:loongarch64
image: ubuntu:24.04
needs: [ build:linux:cross:loongarch64:gcc-14:default ]
# Runs the tests labelled "Official" for the loongarch64 GCC-14 build.
test:linux:loongarch64:gcc-14:default:official:
extends: .test:linux:loongarch64:gcc-14:default
variables:
EIGEN_CI_TEST_LABEL: Official
# Runs the tests labelled "Unsupported" for the loongarch64 GCC-14 build.
test:linux:loongarch64:gcc-14:default:unsupported:
extends: .test:linux:loongarch64:gcc-14:default
variables:
EIGEN_CI_TEST_LABEL: Unsupported
##### MR Smoke Tests ###########################################################
test:linux:x86-64:gcc-10:default:smoketest:

View File

@ -367,6 +367,12 @@ macro(ei_testing_print_summary)
message(STATUS "S390X ZVECTOR: Using architecture defaults")
endif()
if(EIGEN_TEST_LSX)
message(STATUS "LSX: ON")
else()
message(STATUS "LSX: Using architecture defaults")
endif()
if(EIGEN_TEST_SYCL)
if(EIGEN_SYCL_TRISYCL)
message(STATUS "SYCL: ON (using triSYCL)")