// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2024 Kseniya Zaytseva
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_RVV10_H
#define EIGEN_PACKET_MATH_RVV10_H

// IWYU pragma: private
#include "../../InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32

template <typename Scalar, std::size_t VectorLength, std::size_t VectorLMul>
struct rvv_packet_size_selector {
  enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) };
};

template <std::size_t VectorLength, std::size_t VectorLMul>
struct rvv_packet_alignment_selector {
  enum {
    alignment = (VectorLength * VectorLMul) >= 1024
                    ? Aligned128
                    : ((VectorLength * VectorLMul) >= 512 ? Aligned64
                                                          : ((VectorLength * VectorLMul) >= 256 ? Aligned32 : Aligned16))
  };
};

typedef vbool64_t PacketMask64;
typedef vbool32_t PacketMask32;
typedef vbool16_t PacketMask16;
typedef vbool8_t PacketMask8;
typedef vbool4_t PacketMask4;

/********************************* int32 **************************************/
typedef eigen_packet_wrapper<vint32m1_t, 0> PacketXi;
typedef eigen_packet_wrapper<vuint32m1_t, 1> PacketXu;
typedef eigen_packet_wrapper<vint32m2_t, 2> PacketMul2Xi;
typedef eigen_packet_wrapper<vuint32m2_t, 3> PacketMul2Xu;
typedef eigen_packet_wrapper<vint32m4_t, 4> PacketMul4Xi;
typedef eigen_packet_wrapper<vuint32m4_t, 5> PacketMul4Xu;

template <>
struct packet_traits<numext::int32_t> : default_packet_traits {
  typedef PacketXi type;
  typedef PacketXi half;  // Half not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 1>::size,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0
  };
};

template <>
struct packet_traits<numext::int32_t> : default_packet_traits {
  typedef PacketMul2Xi type;
  typedef PacketXi half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 2>::size,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0
  };
};

template <>
struct packet_traits<numext::int32_t> : default_packet_traits {
  typedef PacketMul4Xi type;
  typedef PacketMul2Xi half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 4>::size,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0
  };
};

template <>
struct unpacket_traits<PacketXi> {
  typedef numext::int32_t type;
  typedef PacketXi half;  // Half not yet implemented
  typedef numext::uint8_t mask_t;
  enum {
    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 1>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<PacketMul2Xi> {
  typedef numext::int32_t type;
  typedef PacketXi half;
  typedef numext::uint8_t mask_t;
  enum {
    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 2>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<PacketMul4Xi> {
  typedef numext::int32_t type;
  typedef PacketMul2Xi half;
  typedef numext::uint8_t mask_t;
  enum {
    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 4>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  __builtin_prefetch(addr);
#endif
}

/********************************* PacketXi ************************************/

template <>
EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from) {
  return __riscv_vmv_v_x_i32m1(from, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
  PacketXi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size));
  return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
  return __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vadd_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vsub(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
  return __riscv_vneg(a, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vmul(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vdiv(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
  return __riscv_vmadd(a, b, c, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pnmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
  return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pnmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
  return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vmin(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vmax(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
  PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits<PacketXi>::size);
  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
  PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits<PacketXi>::size);
  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
  PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits<PacketXi>::size);
  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
  return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vand_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vor_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vxor_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits<PacketXi>::size),
                               unpacket_traits<PacketXi>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
  return __riscv_vsra_vx_i32m1(a, N, unpacket_traits<PacketXi>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
  return __riscv_vreinterpret_i32m1(
      __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits<PacketXi>::size));
}

template <int N>
EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
  return __riscv_vsll_vx_i32m1(a, N, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from) {
  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size);
  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketXi>::size), 1,
                              unpacket_traits<PacketXi>::size);
  // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
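  // vloxei32 gathers with *byte* offsets, so (i & ~1) << 1 == (i / 2) * sizeof(int32_t):
  // each pair of lanes shares one source element. For example, with 4 lanes,
  // loading from {10, 11, 12, 13} yields {10, 10, 11, 11}.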
  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from) {
  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size);
  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketXi>::size);
  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from,
                                                                  Index stride) {
  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a) {
  return __riscv_vmv_x_s_i32m1_i32(a);
}

template <>
EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
  PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size),
                                        unpacket_traits<PacketXi>::size - 1, unpacket_traits<PacketXi>::size);
  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
  PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<PacketXi>::size);
  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<PacketXi>::size), mask,
                               unpacket_traits<PacketXi>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketXi>::size),
                                                      unpacket_traits<PacketXi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
  // Multiply the vector by its reverse
  PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<PacketXi>::size);
  PacketXi half_prod;

  if (EIGEN_RISCV64_RVV_VL >= 1024) {
    half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits<PacketXi>::size);
    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 512) {
    half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits<PacketXi>::size);
    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 256) {
    half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits<PacketXi>::size);
    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
  }
  // Last reduction
  half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits<PacketXi>::size);
  prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);

  // The reduction is done to the first element.
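  // The fold above is a log2(size)-step tree reduction: the reverse multiply
  // leaves lane i holding a[i] * a[size-1-i], and each slidedown-and-multiply
  // halves the prefix that still matters. E.g. with 4 lanes {a, b, c, d}, the
  // reverse multiply gives {ad, bc, cb, da}, and a single stride-1 fold leaves
  // the full product ad*bc in lane 0.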
  return pfirst(prod);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a) {
  return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(
      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketXi>::size),
      unpacket_traits<PacketXi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a) {
  return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(
      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketXi>::size),
      unpacket_traits<PacketXi>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
  numext::int32_t buffer[unpacket_traits<PacketXi>::size * N] = {0};
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketXi>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits<PacketXi>::size], unpacket_traits<PacketXi>::size);
  }
}

/********************************* PacketMul4Xi ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pset1<PacketMul4Xi>(const numext::int32_t& from) {
  return __riscv_vmv_v_x_i32m4(from, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi plset<PacketMul4Xi>(const numext::int32_t& a) {
  PacketMul4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size));
  return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pzero<PacketMul4Xi>(const PacketMul4Xi& /*a*/) {
  return __riscv_vmv_v_x_i32m4(0, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi padd<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vadd_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi psub<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vsub(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) {
  return __riscv_vneg(a, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pmul<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vmul(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pdiv<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vdiv(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
  return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
  return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pmin<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vmin(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pmax<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vmax(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits<PacketMul4Xi>::size);
  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits<PacketMul4Xi>::size);
  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits<PacketMul4Xi>::size);
  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi ptrue<PacketMul4Xi>(const PacketMul4Xi& /*a*/) {
  return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pand<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vand_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi por<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vor_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pxor<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vxor_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pandnot<PacketMul4Xi>(const PacketMul4Xi& a, const PacketMul4Xi& b) {
  return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits<PacketMul4Xi>::size),
                               unpacket_traits<PacketMul4Xi>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) {
  return __riscv_vsra_vx_i32m4(a, N, unpacket_traits<PacketMul4Xi>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) {
  return __riscv_vreinterpret_i32m4(
      __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits<PacketMul4Xi>::size));
}

template <int N>
EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) {
  return __riscv_vsll_vx_i32m4(a, N, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pload<PacketMul4Xi>(const numext::int32_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi ploadu<PacketMul4Xi>(const numext::int32_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi ploaddup<PacketMul4Xi>(const numext::int32_t* from) {
  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size);
  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<PacketMul4Xi>::size), 1,
                              unpacket_traits<PacketMul4Xi>::size);
  // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
  return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi ploadquad<PacketMul4Xi>(const numext::int32_t* from) {
  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size);
  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<PacketMul4Xi>::size);
  return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketMul4Xi& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketMul4Xi& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather<numext::int32_t, PacketMul4Xi>(const numext::int32_t* from,
                                                                             Index stride) {
  return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketMul4Xi>(numext::int32_t* to, const PacketMul4Xi& from,
                                                                      Index stride) {
  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketMul4Xi>(const PacketMul4Xi& a) {
  return __riscv_vmv_x_s_i32m4_i32(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) {
  PacketMul4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size),
                                            unpacket_traits<PacketMul4Xi>::size - 1,
                                            unpacket_traits<PacketMul4Xi>::size);
  return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) {
  PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits<PacketMul4Xi>::size);
  return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits<PacketMul4Xi>::size), mask,
                               unpacket_traits<PacketMul4Xi>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux<PacketMul4Xi>(const PacketMul4Xi& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(
      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul4Xi>::size / 4), unpacket_traits<PacketMul4Xi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul4Xi>(const PacketMul4Xi& a) {
  PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1),
                                         unpacket_traits<PacketXi>::size);
  PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3),
                                         unpacket_traits<PacketXi>::size);
  return predux_mul<PacketXi>(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits<PacketXi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketMul4Xi>(const PacketMul4Xi& a) {
  return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1(
      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul4Xi>::size / 4),
      unpacket_traits<PacketMul4Xi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketMul4Xi>(const PacketMul4Xi& a) {
  return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1(
      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul4Xi>::size / 4),
      unpacket_traits<PacketMul4Xi>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xi, N>& kernel) {
  numext::int32_t buffer[unpacket_traits<PacketMul4Xi>::size * N] = {0};
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul4Xi>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] = __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits<PacketMul4Xi>::size],
                                             unpacket_traits<PacketMul4Xi>::size);
  }
}

/********************************* PacketMul2Xi ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pset1<PacketMul2Xi>(const numext::int32_t& from) {
  return __riscv_vmv_v_x_i32m2(from, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi plset<PacketMul2Xi>(const numext::int32_t& a) {
  PacketMul2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size));
  return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pzero<PacketMul2Xi>(const PacketMul2Xi& /*a*/) {
  return __riscv_vmv_v_x_i32m2(0, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi padd<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vadd_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi psub<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vsub(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) {
  return __riscv_vneg(a, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pmul<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vmul(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pdiv<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vdiv(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
  return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
  return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pmin<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vmin(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pmax<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vmax(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits<PacketMul2Xi>::size);
  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits<PacketMul2Xi>::size);
  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits<PacketMul2Xi>::size);
  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi ptrue<PacketMul2Xi>(const PacketMul2Xi& /*a*/) {
  return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pand<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vand_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi por<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vor_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pxor<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vxor_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pandnot<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
  return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits<PacketMul2Xi>::size),
                               unpacket_traits<PacketMul2Xi>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) {
  return __riscv_vsra_vx_i32m2(a, N, unpacket_traits<PacketMul2Xi>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) {
  return __riscv_vreinterpret_i32m2(
      __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits<PacketMul2Xi>::size));
}

template <int N>
EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) {
  return __riscv_vsll_vx_i32m2(a, N, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pload<PacketMul2Xi>(const numext::int32_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi ploadu<PacketMul2Xi>(const numext::int32_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi ploaddup<PacketMul2Xi>(const numext::int32_t* from) {
  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size);
  idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits<PacketMul2Xi>::size), 1,
                              unpacket_traits<PacketMul2Xi>::size);
  // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi ploadquad<PacketMul2Xi>(const numext::int32_t* from) {
  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size);
  idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits<PacketMul2Xi>::size);
  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketMul2Xi& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketMul2Xi& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather<numext::int32_t, PacketMul2Xi>(const numext::int32_t* from,
                                                                             Index stride) {
  return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketMul2Xi>(numext::int32_t* to, const PacketMul2Xi& from,
                                                                      Index stride) {
  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketMul2Xi>(const PacketMul2Xi& a) {
  return __riscv_vmv_x_s_i32m2_i32(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) {
  PacketMul2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size),
                                            unpacket_traits<PacketMul2Xi>::size - 1,
                                            unpacket_traits<PacketMul2Xi>::size);
  return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) {
  PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits<PacketMul2Xi>::size);
  return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits<PacketMul2Xi>::size), mask,
                               unpacket_traits<PacketMul2Xi>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux<PacketMul2Xi>(const PacketMul2Xi& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(
      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul2Xi>::size / 2), unpacket_traits<PacketMul2Xi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul2Xi>(const PacketMul2Xi& a) {
  return predux_mul<PacketXi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0),
                                                    __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits<PacketXi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketMul2Xi>(const PacketMul2Xi& a) {
  return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1(
      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul2Xi>::size / 2),
      unpacket_traits<PacketMul2Xi>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketMul2Xi>(const PacketMul2Xi& a) {
  return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1(
      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul2Xi>::size / 2),
      unpacket_traits<PacketMul2Xi>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xi, N>& kernel) {
  numext::int32_t buffer[unpacket_traits<PacketMul2Xi>::size * N] = {0};
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul2Xi>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] = __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits<PacketMul2Xi>::size],
                                             unpacket_traits<PacketMul2Xi>::size);
  }
}

template <typename Packet = PacketMul4Xi>
EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul4Xi>::value &&
                                                (unpacket_traits<PacketMul4Xi>::size % 8) == 0,
                                            PacketMul2Xi>::type
predux_half_dowto4(const PacketMul4Xi& a) {
  return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1),
                               unpacket_traits<PacketMul2Xi>::size);
}

template <typename Packet = PacketMul2Xi>
EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xi>::value &&
                                                (unpacket_traits<PacketMul2Xi>::size % 8) == 0,
                                            PacketXi>::type
predux_half_dowto4(const PacketMul2Xi& a) {
  return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
                               unpacket_traits<PacketXi>::size);
}

/********************************* float32 ************************************/

typedef eigen_packet_wrapper<vfloat32m1_t, 6> PacketXf;
typedef eigen_packet_wrapper<vfloat32m2_t, 7> PacketMul2Xf;
typedef eigen_packet_wrapper<vfloat32m4_t, 8> PacketMul4Xf;

template <>
struct packet_traits<float> : default_packet_traits {
  typedef PacketXf type;
  typedef PacketXf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 1>::size,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0,

    HasCmp = 1,
    HasDiv = 1,
    HasFloor = 1,
    HasRint = 1,

    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH
  };
};

template <>
struct packet_traits<float> : default_packet_traits {
  typedef PacketMul2Xf type;
  typedef PacketXf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 2>::size,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0,

    HasCmp = 1,
    HasDiv = 1,
    HasFloor = 1,
    HasRint = 1,

    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH
  };
};

template <>
struct packet_traits<float> : default_packet_traits {
  typedef PacketMul4Xf type;
  typedef PacketMul2Xf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 4>::size,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0,

    HasCmp = 1,
    HasDiv = 1,
    HasFloor = 1,
    HasRint = 1,

    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH
  };
};
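// Worked example of the two selectors above (illustration only; a 256-bit
// VLEN is an assumed configuration, not a requirement): with
// EIGEN_RISCV64_RVV_VL == 256 and LMUL == 2, a float packet spans
// 256 * 2 / (sizeof(float) * CHAR_BIT) == 512 / 32 == 16 lanes, and
// rvv_packet_alignment_selector<256, 2> yields Aligned64 because 512 >= 512.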
template <>
struct unpacket_traits<PacketXf> {
  typedef float type;
  typedef PacketXf half;  // Half not yet implemented
  typedef PacketXi integer_packet;
  typedef numext::uint8_t mask_t;
  typedef PacketMask32 packet_mask;
  enum {
    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 1>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<PacketMul2Xf> {
  typedef float type;
  typedef PacketXf half;
  typedef PacketMul2Xi integer_packet;
  typedef numext::uint8_t mask_t;
  typedef PacketMask16 packet_mask;
  enum {
    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 2>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<PacketMul4Xf> {
  typedef float type;
  typedef PacketMul2Xf half;
  typedef PacketMul4Xi integer_packet;
  typedef numext::uint8_t mask_t;
  typedef PacketMask8 packet_mask;
  enum {
    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 4>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

/********************************* PacketXf ************************************/

template <>
EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXf pzero<PacketXf>(const PacketXf& /*a*/) {
  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
  return __riscv_vfabs_v_f32m1(a, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
  PacketXf idx = __riscv_vfcvt_f_x_v_f32m1(
      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size)),
      unpacket_traits<PacketXf>::size);
  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
  return __riscv_vfneg_v_f32m1(a, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
  PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size);
  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketXf>::size);
  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketXf>::size);
  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return pmin<PropagateNaN>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
  PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size);
  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketXf>::size);
  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketXf>::size);
  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return pmax<PropagateNaN>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<PacketXf>::size);
}

// Logical Operations are not supported for float, so reinterpret casts
template <>
EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(
      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(
      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
      __riscv_vreinterpret_v_f32m1_u32m1(a),
      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size),
      unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size);
  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketXf>::size), 1,
                              unpacket_traits<PacketXf>::size);
  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size);
  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketXf>::size);
  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
  return __riscv_vfmv_f_s_f32m1_f32(a);
}

template <>
EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) {
  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf print<PacketXf>(const PacketXf& a) {
  const PacketXf limit = pset1<PacketXf>(static_cast<float>(1 << 23));
  const PacketXf abs_a = pabs(a);

  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
  const PacketXf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<PacketXf>::size);
  const PacketXf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<PacketXf>::size),
                                                   unpacket_traits<PacketXf>::size);

  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<PacketXf>::size);
  PacketXf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<PacketXf>::size);
  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
  PacketXf tmp = print<PacketXf>(a);
  // If greater, subtract one.
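  // print rounds to nearest, so it can overshoot the input; comparing a < tmp
  // and conditionally subtracting 1 turns round-to-nearest into floor. E.g.
  // a = 1.7: print gives 2.0 and 1.7 < 2.0, so the result is 1.0; a = -1.3:
  // print gives -1.0 and -1.3 < -1.0, so the result is -2.0.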
  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<PacketXf>::size);
  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
  PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size),
                                        unpacket_traits<PacketXf>::size - 1, unpacket_traits<PacketXf>::size);
  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<PacketXf>::size),
                                                         unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  // Multiply the vector by its reverse
  PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<PacketXf>::size);
  PacketXf half_prod;

  if (EIGEN_RISCV64_RVV_VL >= 1024) {
    half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits<PacketXf>::size);
    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 512) {
    half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits<PacketXf>::size);
    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 256) {
    half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits<PacketXf>::size);
    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
  }
  // Last reduction
  half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits<PacketXf>::size);
  prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);

  // The reduction is done to the first element.
  return pfirst(prod);
}

template <>
EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
  return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(
      a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::max)(), unpacket_traits<PacketXf>::size),
      unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
  return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(
      a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits<float>::max)(), unpacket_traits<PacketXf>::size),
      unpacket_traits<PacketXf>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
  float buffer[unpacket_traits<PacketXf>::size * N];
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<PacketXf>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] = __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits<PacketXf>::size],
                                             unpacket_traits<PacketXf>::size);
  }
}

template <>
EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
  return pldexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE PacketMask32 por<PacketMask32>(const PacketMask32& a, const PacketMask32& b) {
  return __riscv_vmor_mm_b32(a, b, unpacket_traits<PacketXf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMask32 pand<PacketMask32>(const PacketMask32& a, const PacketMask32& b) {
  return __riscv_vmand_mm_b32(a, b, unpacket_traits<PacketXf>::size);
}

EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketXf& a, const PacketXf& b) {
  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
}

EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketXf& a, const PacketXf& b) {
  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
}

EIGEN_STRONG_INLINE PacketXf pselect(const PacketMask32& mask, const PacketXf& a, const PacketXf& b) {
  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<PacketXf>::size);
}

/********************************* PacketMul4Xf ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul4Xf ptrue<PacketMul4Xf>(const PacketMul4Xf& /*a*/) {
  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pzero<PacketMul4Xf>(const PacketMul4Xf& /*a*/) {
  return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) {
  return __riscv_vfabs_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pset1<PacketMul4Xf>(const float& from) {
  return __riscv_vfmv_v_f_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits<PacketMul4Xf>(numext::uint32_t from) {
  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf plset<PacketMul4Xf>(const float& a) {
  PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(
      __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size)),
      unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf padd<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf psub<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) {
  return __riscv_vfneg_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pconj(const PacketMul4Xf& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmul<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pdiv<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
  return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
  return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
  return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
  return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  PacketMul4Xf nans =
      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmin<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return pmin<PropagateNaN>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  PacketMul4Xf nans =
      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmax<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return pmax<PropagateNaN>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul4Xf>::size);
}

// Logical Operations are not supported for float, so reinterpret casts
template <>
EIGEN_STRONG_INLINE PacketMul4Xf pand<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
                                                                  __riscv_vreinterpret_v_f32m4_u32m4(b),
                                                                  unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf por<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
                                                                 __riscv_vreinterpret_v_f32m4_u32m4(b),
                                                                 unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pxor<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
                                                                  __riscv_vreinterpret_v_f32m4_u32m4(b),
                                                                  unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pandnot<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(
      __riscv_vreinterpret_v_f32m4_u32m4(a),
      __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size),
      unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pload<PacketMul4Xf>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf ploadu<PacketMul4Xf>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf ploaddup<PacketMul4Xf>(const float* from) {
  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size);
  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<PacketMul4Xf>::size), 1,
                              unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf ploadquad<PacketMul4Xf>(const float* from) {
  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size);
  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketMul4Xf& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketMul4Xf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather<float, PacketMul4Xf>(const float* from, Index stride) {
  return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, PacketMul4Xf>(float* to, const PacketMul4Xf& from, Index stride) {
  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE float pfirst<PacketMul4Xf>(const PacketMul4Xf& a) {
  return __riscv_vfmv_f_s_f32m4_f32(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) {
  return __riscv_vfsqrt_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf print<PacketMul4Xf>(const PacketMul4Xf& a) {
  const PacketMul4Xf limit = pset1<PacketMul4Xf>(static_cast<float>(1 << 23));
  const PacketMul4Xf abs_a = pabs(a);

  PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
  const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits<PacketMul4Xf>::size);
  const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4(
      __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits<PacketMul4Xf>::size), unpacket_traits<PacketMul4Xf>::size);

  mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits<PacketMul4Xf>::size);
  PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pfloor<PacketMul4Xf>(const PacketMul4Xf& a) {
  PacketMul4Xf tmp = print<PacketMul4Xf>(a);
  // If greater, subtract one.
  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) {
  PacketMul4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size),
                                            unpacket_traits<PacketMul4Xf>::size - 1,
                                            unpacket_traits<PacketMul4Xf>::size);
  return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits<PacketMul4Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pfrexp<PacketMul4Xf>(const PacketMul4Xf& a, PacketMul4Xf& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE float predux<PacketMul4Xf>(const PacketMul4Xf& a) {
  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1(
      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<PacketMul4Xf>::size / 4), unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE float predux_mul<PacketMul4Xf>(const PacketMul4Xf& a) {
  PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1),
                                          unpacket_traits<PacketXf>::size);
  PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3),
                                          unpacket_traits<PacketXf>::size);
  return predux_mul<PacketXf>(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits<PacketXf>::size));
}

template <>
EIGEN_STRONG_INLINE float predux_min<PacketMul4Xf>(const PacketMul4Xf& a) {
  return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1(
      a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::max)(), unpacket_traits<PacketMul4Xf>::size / 4),
      unpacket_traits<PacketMul4Xf>::size));
}

template <>
EIGEN_STRONG_INLINE float predux_max<PacketMul4Xf>(const PacketMul4Xf& a) {
  return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1(
      a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits<float>::max)(), unpacket_traits<PacketMul4Xf>::size / 4),
      unpacket_traits<PacketMul4Xf>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xf, N>& kernel) {
  float buffer[unpacket_traits<PacketMul4Xf>::size * N];
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<PacketMul4Xf>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] = __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits<PacketMul4Xf>::size],
                                             unpacket_traits<PacketMul4Xf>::size);
  }
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xf pldexp<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& exponent) {
  return pldexp_generic(a, exponent);
}

/********************************* PacketMul2Xf ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul2Xf ptrue<PacketMul2Xf>(const PacketMul2Xf& /*a*/) {
  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<PacketMul2Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pzero<PacketMul2Xf>(const PacketMul2Xf& /*a*/) {
  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) {
  return __riscv_vfabs_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pset1<PacketMul2Xf>(const float& from) {
  return __riscv_vfmv_v_f_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits<PacketMul2Xf>(numext::uint32_t from) {
  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits<PacketMul2Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf plset<PacketMul2Xf>(const float& a) {
  PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(
      __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size)),
      unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf padd<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf psub<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) {
  return __riscv_vfneg_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmul<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pdiv<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
  return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
  return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
  return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
  return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  PacketMul2Xf nans =
      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmin<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return pmin<PropagateNaN>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  PacketMul2Xf nans =
      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmax<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return pmax<PropagateNaN>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul2Xf>::size);
}

// Logical Operations are not supported for float, so reinterpret casts
template <>
EIGEN_STRONG_INLINE PacketMul2Xf pand<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
                                                                  __riscv_vreinterpret_v_f32m2_u32m2(b),
                                                                  unpacket_traits<PacketMul2Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf por<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
                                                                 __riscv_vreinterpret_v_f32m2_u32m2(b),
                                                                 unpacket_traits<PacketMul2Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pxor<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
                                                                  __riscv_vreinterpret_v_f32m2_u32m2(b),
                                                                  unpacket_traits<PacketMul2Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pandnot<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(
      __riscv_vreinterpret_v_f32m2_u32m2(a),
      __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size),
      unpacket_traits<PacketMul2Xf>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf pload<PacketMul2Xf>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf ploadu<PacketMul2Xf>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf ploaddup<PacketMul2Xf>(const float* from) {
  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size);
  idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits<PacketMul2Xf>::size), 1,
                              unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf ploadquad<PacketMul2Xf>(const float* from) {
  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size);
  idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits<PacketMul2Xf>::size);
  return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketMul2Xf& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketMul2Xf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather<float, PacketMul2Xf>(const float* from, Index stride) {
  return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, PacketMul2Xf>(float* to, const PacketMul2Xf& from, Index stride) {
  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE float pfirst<PacketMul2Xf>(const PacketMul2Xf& a) {
  return __riscv_vfmv_f_s_f32m2_f32(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) {
  return __riscv_vfsqrt_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
}

template <>
EIGEN_STRONG_INLINE
PacketMul2Xf print(const PacketMul2Xf& a) { const PacketMul2Xf limit = pset1(static_cast(1 << 23)); const PacketMul2Xf abs_a = pabs(a); PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { PacketMul2Xf tmp = print(a); // If greater, subtract one. PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { PacketMul2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) { return pfrexp_generic(a, exponent); } template <> EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { float buffer[unpacket_traits::size * N]; int i = 0; for (i = 0; i < N; i++) { __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } template <> EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) { return pldexp_generic(a, exponent); } template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, PacketMul2Xf>::type predux_half_dowto4(const PacketMul4Xf& a) { return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, PacketXf>::type predux_half_dowto4(const PacketMul2Xf& a) { return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size); } /********************************* int64 **************************************/ typedef eigen_packet_wrapper PacketXl; typedef eigen_packet_wrapper PacketXul; typedef 
eigen_packet_wrapper PacketMul2Xl; typedef eigen_packet_wrapper PacketMul2Xul; typedef eigen_packet_wrapper PacketMul4Xl; typedef eigen_packet_wrapper PacketMul4Xul; template <> struct packet_traits : default_packet_traits { typedef PacketXl type; typedef PacketXl half; // Half not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0 }; }; template <> struct packet_traits : default_packet_traits { typedef PacketMul2Xl type; typedef PacketXl half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0 }; }; template <> struct packet_traits : default_packet_traits { typedef PacketMul4Xl type; typedef PacketMul2Xl half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0 }; }; template <> struct unpacket_traits { typedef numext::int64_t type; typedef PacketXl half; // Half not yet implemented typedef numext::uint8_t mask_t; enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; }; template <> struct unpacket_traits { typedef numext::int64_t type; typedef PacketXl half; typedef numext::uint8_t mask_t; enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; }; template <> struct unpacket_traits { typedef numext::int64_t type; typedef PacketMul2Xl half; typedef numext::uint8_t mask_t; enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; }; template <> EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) { #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC __builtin_prefetch(addr); #endif } /********************************* PacketXl ************************************/ template <> EIGEN_STRONG_INLINE PacketXl pset1(const numext::int64_t& from) { return __riscv_vmv_v_x_i64m1(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl plset(const numext::int64_t& a) { PacketXl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)); return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pzero(const PacketXl& /*a*/) { return __riscv_vmv_v_x_i64m1(0, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl padd(const PacketXl& a, const PacketXl& b) { return __riscv_vadd_vv_i64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl psub(const PacketXl& a, const PacketXl& b) { return __riscv_vsub(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pnegate(const PacketXl& a) { return __riscv_vneg(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pconj(const PacketXl& a) { return a; } 
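// Illustrative sketch (not part of the kernel API): how the PacketXl helpers
// above compose, assuming a hypothetical VLEN of 128 so a PacketXl holds two
// int64 lanes:
//
//   PacketXl p = plset<PacketXl>(10);  // lanes {10, 11}
//   p = padd(p, pset1<PacketXl>(5));   // lanes {15, 16}
//   p = pnegate(p);                    // lanes {-15, -16}
//
// plset materialises {a, a+1, ...} by reinterpreting the vid lane-index
// vector as int64 and adding the scalar; the other helpers map one-to-one
// onto single RVV intrinsics.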
template <> EIGEN_STRONG_INLINE PacketXl pmul(const PacketXl& a, const PacketXl& b) { return __riscv_vmul(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pdiv(const PacketXl& a, const PacketXl& b) { return __riscv_vdiv(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pnmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pnmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pmin(const PacketXl& a, const PacketXl& b) { return __riscv_vmin(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pmax(const PacketXl& a, const PacketXl& b) { return __riscv_vmax(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pcmp_le(const PacketXl& a, const PacketXl& b) { PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pcmp_lt(const PacketXl& a, const PacketXl& b) { PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pcmp_eq(const PacketXl& a, const PacketXl& b) { PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl ptrue(const PacketXl& /*a*/) { return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pand(const PacketXl& a, const PacketXl& b) { return __riscv_vand_vv_i64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl por(const PacketXl& a, const PacketXl& b) { return __riscv_vor_vv_i64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pxor(const PacketXl& a, const PacketXl& b) { return __riscv_vxor_vv_i64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pandnot(const PacketXl& a, const PacketXl& b) { return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXl parithmetic_shift_right(PacketXl a) { return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXl plogical_shift_right(PacketXl a) { return __riscv_vreinterpret_i64m1( __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); } template EIGEN_STRONG_INLINE PacketXl plogical_shift_left(PacketXl a) { return __riscv_vsll_vx_i64m1(a, N, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pload(const numext::int64_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl ploadu(const numext::int64_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return 
__riscv_vle64_v_i64m1(from, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const numext::int64_t* from) {
  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size);
  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketXl>::size), 2,
                              unpacket_traits<PacketXl>::size);
  // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const numext::int64_t* from) {
  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size);
  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketXl>::size), 1,
                              unpacket_traits<PacketXl>::size);
  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const PacketXl& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const PacketXl& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketXl pgather<numext::int64_t, PacketXl>(const numext::int64_t* from, Index stride) {
  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, PacketXl>(numext::int64_t* to, const PacketXl& from,
                                                                  Index stride) {
  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int64_t pfirst<PacketXl>(const PacketXl& a) {
  return __riscv_vmv_x_s_i64m1_i64(a);
}

template <>
EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) {
  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size),
                                         unpacket_traits<PacketXl>::size - 1, unpacket_traits<PacketXl>::size);
  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXl pabs(const PacketXl& a) {
  PacketXl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits<PacketXl>::size);
  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits<PacketXl>::size), mask,
                               unpacket_traits<PacketXl>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int64_t predux<PacketXl>(const PacketXl& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<PacketXl>::size),
                                                      unpacket_traits<PacketXl>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int64_t predux_mul<PacketXl>(const PacketXl& a) {
  // Multiply the vector by its reverse
  PacketXl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits<PacketXl>::size);
  PacketXl half_prod;

  if (EIGEN_RISCV64_RVV_VL >= 1024) {
    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits<PacketXl>::size);
    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<PacketXl>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 512) {
    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits<PacketXl>::size);
    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<PacketXl>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 256) {
    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits<PacketXl>::size);
    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<PacketXl>::size);
  }
  // The reduction is done to the first element.
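  // Worked example (hypothetical VLEN = 256, so a PacketXl holds 4 lanes):
  //   a    = {a0, a1, a2, a3}
  //   prod = a * reverse(a) = {a0*a3, a1*a2, a2*a1, a3*a0}
  //   slide down by 1 and multiply -> lane 0 = (a0*a3) * (a1*a2), the full product.
  // Wider VLENs take the earlier branches first, halving the number of live
  // lanes each time, so lane 0 always ends up holding the product of every element.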
return pfirst(prod); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketXl& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1( a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketXl& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1( a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), unpacket_traits::size)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { numext::int64_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } /********************************* PacketMul4Xl ************************************/ template <> EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { PacketMul4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vsub(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { return __riscv_vneg(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { return a; } template <> EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vmul(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vdiv(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pmin(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vmin(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pmax(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vmax(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) { PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, 
unpacket_traits::size); return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) { PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) { PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl ptrue(const PacketMul4Xl& /*a*/) { return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pand(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl por(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { return __riscv_vreinterpret_i64m4( __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); } template EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) { return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pload(const numext::int64_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
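  // (vloxei64 takes byte offsets, hence the extra shift: (i & ~1) << 2 == (i / 2) * sizeof(int64_t).)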
return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul4Xl& from) { EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul4Xl& from) { EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, Index stride) { return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) { return __riscv_vmv_x_s_i64m4_i64(a); } template <> EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { PacketMul4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), unpacket_traits::size); PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul4Xl& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), unpacket_traits::size)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { numext::int64_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } /********************************* PacketMul2Xl ************************************/ template <> EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { return 
__riscv_vmv_v_x_i64m2(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { PacketMul2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vsub(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { return __riscv_vneg(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pconj(const PacketMul2Xl& a) { return a; } template <> EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vmul(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vdiv(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vmin(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vmax(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { return 
__riscv_vor_vv_i64m2(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { return __riscv_vreinterpret_i64m2( __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); } template EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, Index stride) { return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { return __riscv_vmv_x_s_i64m2_i64(a); } template <> EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { PacketMul2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { return 
__riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), unpacket_traits::size)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { numext::int64_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, PacketMul2Xl>::type predux_half_dowto4(const PacketMul4Xl& a) { return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, PacketXl>::type predux_half_dowto4(const PacketMul2Xl& a) { return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size); } /********************************* double ************************************/ typedef eigen_packet_wrapper PacketXd; typedef eigen_packet_wrapper PacketMul2Xd; typedef eigen_packet_wrapper PacketMul4Xd; template <> struct packet_traits : default_packet_traits { typedef PacketXd type; typedef PacketXd half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0, HasCmp = 1, HasDiv = 1, HasFloor = 1, HasRint = 1, HasLog = 1, HasExp = 1, HasSqrt = 1 }; }; template <> struct packet_traits : default_packet_traits { typedef PacketMul2Xd type; typedef PacketXd half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0, HasCmp = 1, HasDiv = 1, HasFloor = 1, HasRint = 1, HasLog = 1, HasExp = 1, HasSqrt = 1 }; }; template <> struct packet_traits : default_packet_traits { typedef PacketMul4Xd type; typedef PacketMul2Xd half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0, HasCmp = 1, HasDiv = 1, HasFloor = 1, HasRint = 1, HasLog = 1, HasExp = 1, HasSqrt = 1 }; }; template <> struct unpacket_traits { typedef double type; typedef PacketXd half; // Half not yet implemented typedef PacketXl 
integer_packet; typedef numext::uint8_t mask_t; typedef PacketMask64 packet_mask; enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; }; template <> struct unpacket_traits { typedef double type; typedef PacketXd half; typedef PacketMul2Xl integer_packet; typedef numext::uint8_t mask_t; typedef PacketMask32 packet_mask; enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; }; template <> struct unpacket_traits { typedef double type; typedef PacketMul2Xd half; typedef PacketMul4Xl integer_packet; typedef numext::uint8_t mask_t; typedef PacketMask16 packet_mask; enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; }; /********************************* PacketXd ************************************/ template <> EIGEN_STRONG_INLINE PacketXd ptrue(const PacketXd& /*a*/) { return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd pzero(const PacketXd& /*a*/) { return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pabs(const PacketXd& a) { return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pset1(const double& from) { return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pset1frombits(numext::uint64_t from) { return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd plset(const double& a) { PacketXd idx = __riscv_vfcvt_f_x_v_f64m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)), unpacket_traits::size); return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd padd(const PacketXd& a, const PacketXd& b) { return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd psub(const PacketXd& a, const PacketXd& b) { return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pnegate(const PacketXd& a) { return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pconj(const PacketXd& a) { return a; } template <> EIGEN_STRONG_INLINE PacketXd pmul(const PacketXd& a, const PacketXd& b) { return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pdiv(const PacketXd& a, const PacketXd& b) { return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pnmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pnmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { return 
__riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { return pmin(a, b); } template <> EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { return pmax(a, b); } template <> EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pcmp_le(const PacketXd& a, const PacketXd& b) { PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits::size); return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pcmp_lt(const PacketXd& a, const PacketXd& b) { PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pcmp_eq(const PacketXd& a, const PacketXd& b) { PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pcmp_lt_or_nan(const PacketXd& a, const PacketXd& b) { PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); } // Logical Operations are not supported for double, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketXd pand(const PacketXd& a, const PacketXd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd por(const PacketXd& a, const PacketXd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd pxor(const PacketXd& a, const PacketXd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd pandnot(const PacketXd& a, const PacketXd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( 
__riscv_vreinterpret_v_f64m1_u64m1(a),
      __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketXd>::size),
      unpacket_traits<PacketXd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketXd pload<PacketXd>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd ploadu<PacketXd>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size);
  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketXd>::size), 2,
                              unpacket_traits<PacketXd>::size);
  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size);
  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketXd>::size), 1,
                              unpacket_traits<PacketXd>::size);
  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const PacketXd& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const PacketXd& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketXd pgather<double, PacketXd>(const double* from, Index stride) {
  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, PacketXd>(double* to, const PacketXd& from, Index stride) {
  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE double pfirst<PacketXd>(const PacketXd& a) {
  return __riscv_vfmv_f_s_f64m1_f64(a);
}

template <>
EIGEN_STRONG_INLINE PacketXd psqrt(const PacketXd& a) {
  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd print(const PacketXd& a) {
  // Round to nearest integer while keeping the sign of the input and
  // propagating NaNs; values with |a| >= 2^52 are already integral.
  const PacketXd limit = pset1<PacketXd>(static_cast<double>(1ull << 52));
  const PacketXd abs_a = pabs(a);

  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<PacketXd>::size);
  const PacketXd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<PacketXd>::size);
  const PacketXd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<PacketXd>::size),
                                                   unpacket_traits<PacketXd>::size);
  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<PacketXd>::size);
  PacketXd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<PacketXd>::size);
  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) {
  PacketXd tmp = print(a);
  // If greater, subtract one.
  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits<PacketXd>::size);
  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) {
  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size),
                                         unpacket_traits<PacketXd>::size - 1, unpacket_traits<PacketXd>::size);
  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketXd>::size), unpacket_traits<PacketXd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  // Multiply the vector by its reverse
  PacketXd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits<PacketXd>::size);
  PacketXd half_prod;

  if (EIGEN_RISCV64_RVV_VL >= 1024) {
    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits<PacketXd>::size);
    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketXd>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 512) {
    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits<PacketXd>::size);
    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketXd>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 256) {
    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits<PacketXd>::size);
    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketXd>::size);
  }
  // The reduction is done to the first element.
  return pfirst(prod);
}

template <>
EIGEN_STRONG_INLINE double predux_min<PacketXd>(const PacketXd& a) {
  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::max)(), unpacket_traits<PacketXd>::size),
      unpacket_traits<PacketXd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_max<PacketXd>(const PacketXd& a) {
  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
      a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits<double>::max)(), unpacket_traits<PacketXd>::size),
      unpacket_traits<PacketXd>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXd, N>& kernel) {
  double buffer[unpacket_traits<PacketXd>::size * N];
  int i = 0;

  // Strided stores interleave the packets into row-major order; contiguous
  // loads then read the transposed rows back.
  for (i = 0; i < N; i++) {
    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketXd>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<PacketXd>::size], unpacket_traits<PacketXd>::size);
  }
}

template <>
EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& exponent) {
  return pldexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
  // Computes a & ~b on mask registers.
  return __riscv_vmandn_mm_b64(a, b, unpacket_traits<PacketXd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
  return __riscv_vmand_mm_b64(a, b, unpacket_traits<PacketXd>::size);
}

EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketXd& a, const PacketXd& b) {
  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
}

EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketXd& a, const PacketXd& b) {
  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
}

EIGEN_STRONG_INLINE PacketXd pselect(const PacketMask64& mask, const PacketXd& a, const PacketXd& b) {
  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits<PacketXd>::size);
}

/*********************************
PacketMul4Xd ************************************/ template <> EIGEN_STRONG_INLINE PacketMul4Xd ptrue(const PacketMul4Xd& /*a*/) { return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) { return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) { return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pset1(const double& from) { return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t from) { return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) { PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)), unpacket_traits::size); return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd padd(const PacketMul4Xd& a, const PacketMul4Xd& b) { return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd psub(const PacketMul4Xd& a, const PacketMul4Xd& b) { return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) { return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) { return a; } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmul(const PacketMul4Xd& a, const PacketMul4Xd& b) { return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const PacketMul4Xd& b) { return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { PacketMul4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { return pmin(a, b); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pmax(const 
template <>
EIGEN_STRONG_INLINE PacketMul4Xd pmax<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  PacketMul4Xd nans =
      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul4Xd>::size);
  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<PacketMul4Xd>::size);
  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pmax<PropagateNaN, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  return pmax<PacketMul4Xd>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pmax<PropagateNumbers, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  // (a < b) or unordered == NOT(a >= b): clear exactly the lanes where a >= b holds.
  PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits<PacketMul4Xd>::size);
}

// Logical operations are not supported for double, so reinterpret to u64, operate, and cast back.
template <>
EIGEN_STRONG_INLINE PacketMul4Xd pand<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(
      __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<PacketMul4Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd por<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(
      __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<PacketMul4Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pxor<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(
      __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<PacketMul4Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pandnot<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(
      __riscv_vreinterpret_v_f64m4_u64m4(a),
      __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<PacketMul4Xd>::size),
      unpacket_traits<PacketMul4Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pload<PacketMul4Xd>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd ploadu<PacketMul4Xd>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<PacketMul4Xd>::size);
}
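// ploaddup/ploadquad below build BYTE offsets with vid and bit tricks, then use an
// indexed gather (vloxei64). For ploaddup each source element is read twice:
// lane i reads from[i / 2], i.e. byte offset ((i & ~1) << 2) for 8-byte doubles,
// which yields the sequence 0 0 8 8 16 16 ... For ploadquad each element is read
// four times: lane i reads from[i / 4], i.e. byte offset ((i & ~3) << 1).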
template <>
EIGEN_STRONG_INLINE PacketMul4Xd ploaddup<PacketMul4Xd>(const double* from) {
  PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xd>::size);
  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul4Xd>::size), 2,
                              unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd ploadquad<PacketMul4Xd>(const double* from) {
  PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xd>::size);
  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul4Xd>::size), 1,
                              unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const PacketMul4Xd& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const PacketMul4Xd& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather<double, PacketMul4Xd>(const double* from, Index stride) {
  return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, PacketMul4Xd>(double* to, const PacketMul4Xd& from, Index stride) {
  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE double pfirst<PacketMul4Xd>(const PacketMul4Xd& a) {
  return __riscv_vfmv_f_s_f64m4_f64(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) {
  return __riscv_vfsqrt_v_f64m4(a, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd print<PacketMul4Xd>(const PacketMul4Xd& a) {
  const PacketMul4Xd limit = pset1<PacketMul4Xd>(static_cast<double>(1ull << 52));
  const PacketMul4Xd abs_a = pabs(a);
  PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
  const PacketMul4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits<PacketMul4Xd>::size);
  const PacketMul4Xd new_x = __riscv_vfcvt_f_x_v_f64m4(
      __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits<PacketMul4Xd>::size), unpacket_traits<PacketMul4Xd>::size);
  mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits<PacketMul4Xd>::size);
  PacketMul4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits<PacketMul4Xd>::size);
}
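// print() above rounds to the nearest integer using the current rounding mode:
// the int64 round-trip (vfcvt_x then vfcvt_f) performs the rounding, vfsgnj
// restores the sign so that -0.0 survives, and the 2^52 threshold guards the
// final merge because every double with |x| >= 2^52 is already integral (and
// the int64 conversion could overflow there). NaN lanes are regenerated via
// a + a on the vmfne mask, which also quiets signaling NaNs.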
template <>
EIGEN_STRONG_INLINE PacketMul4Xd pfloor<PacketMul4Xd>(const PacketMul4Xd& a) {
  PacketMul4Xd tmp = print(a);
  // If the rounded value is greater than a, subtract one.
  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) {
  PacketMul4Xul idx =
      __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xd>::size),
                             unpacket_traits<PacketMul4Xd>::size - 1, unpacket_traits<PacketMul4Xd>::size);
  return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits<PacketMul4Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pfrexp<PacketMul4Xd>(const PacketMul4Xd& a, PacketMul4Xd& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE double predux<PacketMul4Xd>(const PacketMul4Xd& a) {
  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1(
      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul4Xd>::size / 4), unpacket_traits<PacketMul4Xd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_mul<PacketMul4Xd>(const PacketMul4Xd& a) {
  // Multiply the four m1 quarters pairwise, then reduce the remaining m1 vector.
  PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1),
                                          unpacket_traits<PacketXd>::size);
  PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3),
                                          unpacket_traits<PacketXd>::size);
  return predux_mul<PacketXd>(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits<PacketXd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_min<PacketMul4Xd>(const PacketMul4Xd& a) {
  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1(
      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::max)(), unpacket_traits<PacketMul4Xd>::size / 4),
      unpacket_traits<PacketMul4Xd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_max<PacketMul4Xd>(const PacketMul4Xd& a) {
  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1(
      a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits<double>::max)(), unpacket_traits<PacketMul4Xd>::size / 4),
      unpacket_traits<PacketMul4Xd>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xd, N>& kernel) {
  // Strided stores scatter packet i into column i of a row-major scratch buffer;
  // contiguous reloads then pick up the transposed data.
  double buffer[unpacket_traits<PacketMul4Xd>::size * N];
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketMul4Xd>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits<PacketMul4Xd>::size], unpacket_traits<PacketMul4Xd>::size);
  }
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xd pldexp<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& exponent) {
  return pldexp_generic(a, exponent);
}

/********************************* PacketMul2Xd ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul2Xd ptrue<PacketMul2Xd>(const PacketMul2Xd& /*a*/) {
  return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pzero<PacketMul2Xd>(const PacketMul2Xd& /*a*/) {
  return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) {
  return __riscv_vfabs_v_f64m2(a, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pset1<PacketMul2Xd>(const double& from) {
  return __riscv_vfmv_v_f_f64m2(from, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits<PacketMul2Xd>(numext::uint64_t from) {
  return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd plset<PacketMul2Xd>(const double& a) {
  PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(
      __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits<PacketMul2Xd>::size)),
      unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd padd<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
}
template <>
EIGEN_STRONG_INLINE PacketMul2Xd psub<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pnegate<PacketMul2Xd>(const PacketMul2Xd& a) {
  return __riscv_vfneg_v_f64m2(a, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pconj<PacketMul2Xd>(const PacketMul2Xd& a) { return a; }

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmul<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pdiv<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
  return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
  return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
  return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
  return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmin<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  PacketMul2Xd nans =
      __riscv_vfmv_v_f_f64m2((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul2Xd>::size);
  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits<PacketMul2Xd>::size);
  PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits<PacketMul2Xd>::size);
  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmin<PropagateNaN, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return pmin<PacketMul2Xd>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmin<PropagateNumbers, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmax<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  PacketMul2Xd nans =
      __riscv_vfmv_v_f_f64m2((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul2Xd>::size);
  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits<PacketMul2Xd>::size);
  PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits<PacketMul2Xd>::size);
  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmax<PropagateNaN, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return pmax<PacketMul2Xd>(a, b);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pmax<PropagateNumbers, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xd>::size);
}
template <>
EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits<PacketMul2Xd>::size);
}

// Logical operations are not supported for double, so reinterpret to u64, operate, and cast back.
template <>
EIGEN_STRONG_INLINE PacketMul2Xd pand<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(
      __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd por<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(
      __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pxor<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(
      __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pandnot<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(
      __riscv_vreinterpret_v_f64m2_u64m2(a),
      __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<PacketMul2Xd>::size),
      unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pload<PacketMul2Xd>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd ploadu<PacketMul2Xd>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd ploaddup<PacketMul2Xd>(const double* from) {
  PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits<PacketMul2Xd>::size);
  idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul2Xd>::size), 2,
                              unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd ploadquad<PacketMul2Xd>(const double* from) {
  PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits<PacketMul2Xd>::size);
  idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul2Xd>::size), 1,
                              unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const PacketMul2Xd& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const PacketMul2Xd& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather<double, PacketMul2Xd>(const double* from, Index stride) {
  return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, PacketMul2Xd>(double* to, const PacketMul2Xd& from, Index stride) {
  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketMul2Xd>::size);
}
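// pgather/pscatter map directly onto RVV strided memory instructions: vlse64/vsse64
// take a BYTE stride, hence the `stride * sizeof(double)` scaling. A typical use is
// walking one column of a row-major matrix, where stride is the number of columns.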
template <>
EIGEN_STRONG_INLINE double pfirst<PacketMul2Xd>(const PacketMul2Xd& a) {
  return __riscv_vfmv_f_s_f64m2_f64(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) {
  return __riscv_vfsqrt_v_f64m2(a, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd print<PacketMul2Xd>(const PacketMul2Xd& a) {
  const PacketMul2Xd limit = pset1<PacketMul2Xd>(static_cast<double>(1ull << 52));
  const PacketMul2Xd abs_a = pabs(a);
  PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits<PacketMul2Xd>::size);
  const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits<PacketMul2Xd>::size);
  const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2(
      __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits<PacketMul2Xd>::size), unpacket_traits<PacketMul2Xd>::size);
  mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits<PacketMul2Xd>::size);
  PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pfloor<PacketMul2Xd>(const PacketMul2Xd& a) {
  PacketMul2Xd tmp = print(a);
  // If the rounded value is greater than a, subtract one.
  PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) {
  PacketMul2Xul idx =
      __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits<PacketMul2Xd>::size),
                             unpacket_traits<PacketMul2Xd>::size - 1, unpacket_traits<PacketMul2Xd>::size);
  return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits<PacketMul2Xd>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pfrexp<PacketMul2Xd>(const PacketMul2Xd& a, PacketMul2Xd& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE double predux<PacketMul2Xd>(const PacketMul2Xd& a) {
  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1(
      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul2Xd>::size / 2), unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_mul<PacketMul2Xd>(const PacketMul2Xd& a) {
  return predux_mul<PacketXd>(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
                                                     unpacket_traits<PacketXd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_min<PacketMul2Xd>(const PacketMul2Xd& a) {
  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1(
      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::max)(), unpacket_traits<PacketMul2Xd>::size / 2),
      unpacket_traits<PacketMul2Xd>::size));
}

template <>
EIGEN_STRONG_INLINE double predux_max<PacketMul2Xd>(const PacketMul2Xd& a) {
  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1(
      a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits<double>::max)(), unpacket_traits<PacketMul2Xd>::size / 2),
      unpacket_traits<PacketMul2Xd>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xd, N>& kernel) {
  double buffer[unpacket_traits<PacketMul2Xd>::size * N];
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketMul2Xd>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits<PacketMul2Xd>::size], unpacket_traits<PacketMul2Xd>::size);
  }
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xd pldexp<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& exponent) {
  return pldexp_generic(a, exponent);
}

template <typename Packet = PacketMul4Xd>
EIGEN_STRONG_INLINE
    typename std::enable_if<std::is_same<Packet, PacketMul4Xd>::value && (unpacket_traits<PacketMul4Xd>::size % 8) == 0,
                            PacketMul2Xd>::type
    predux_half_dowto4(const PacketMul4Xd& a) {
  return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1),
                                unpacket_traits<PacketMul2Xd>::size);
}

template <typename Packet = PacketMul2Xd>
EIGEN_STRONG_INLINE
    typename std::enable_if<std::is_same<Packet, PacketMul2Xd>::value && (unpacket_traits<PacketMul2Xd>::size % 8) == 0,
                            PacketXd>::type
    predux_half_dowto4(const PacketMul2Xd& a) {
  return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
                                unpacket_traits<PacketXd>::size);
}
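// predux_half_dowto4 above halves the accumulator width instead of producing a
// scalar: vget splits the LMUL=4 (resp. LMUL=2) register group into its two
// halves and a single vfadd folds them, so an m4 accumulator collapses to m2
// (and m2 to m1) in one instruction. The enable_if guard keeps each overload
// out of overload resolution when the packet is too short to be split
// (size % 8 != 0).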
/********************************* short **************************************/

typedef eigen_packet_wrapper<vint16m1_t, 18> PacketXs;
typedef eigen_packet_wrapper<vuint16m1_t, 19> PacketXsu;
typedef eigen_packet_wrapper<vint16m2_t, 20> PacketMul2Xs;
typedef eigen_packet_wrapper<vuint16m2_t, 21> PacketMul2Xsu;
typedef eigen_packet_wrapper<vint16m4_t, 22> PacketMul4Xs;
typedef eigen_packet_wrapper<vuint16m4_t, 23> PacketMul4Xsu;

// The default int16 packet type is selected by the configured LMUL.
#if EIGEN_RISCV64_RVV_LMUL == 1

template <>
struct packet_traits<numext::int16_t> : default_packet_traits {
  typedef PacketXs type;
  typedef PacketXs half;  // Half not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 1>::size,

    HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1,
    HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0
  };
};

#elif EIGEN_RISCV64_RVV_LMUL == 2

template <>
struct packet_traits<numext::int16_t> : default_packet_traits {
  typedef PacketMul2Xs type;
  typedef PacketXs half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 2>::size,

    HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1,
    HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0
  };
};

#elif EIGEN_RISCV64_RVV_LMUL == 4

template <>
struct packet_traits<numext::int16_t> : default_packet_traits {
  typedef PacketMul4Xs type;
  typedef PacketMul2Xs half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 4>::size,

    HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, HasArg = 0, HasAbs2 = 1,
    HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 0, HasBlend = 0, HasReduxp = 0
  };
};

#endif

template <>
struct unpacket_traits<PacketXs> {
  typedef numext::int16_t type;
  typedef PacketXs half;  // Half not yet implemented
  typedef numext::uint8_t mask_t;
  enum {
    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 1>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<PacketMul2Xs> {
  typedef numext::int16_t type;
  typedef PacketXs half;
  typedef numext::uint8_t mask_t;
  enum {
    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 2>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<PacketMul4Xs> {
  typedef numext::int16_t type;
  typedef PacketMul2Xs half;
  typedef numext::uint8_t mask_t;
  enum {
    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 4>::size,
    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
EIGEN_STRONG_INLINE void prefetch<numext::int16_t>(const numext::int16_t* addr) {
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  __builtin_prefetch(addr);
#endif
}

/********************************* PacketXs ************************************/

template <>
EIGEN_STRONG_INLINE PacketXs pset1<PacketXs>(const numext::int16_t& from) {
  return __riscv_vmv_v_x_i16m1(from, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs plset<PacketXs>(const numext::int16_t& a) {
  PacketXs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size));
  return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pzero<PacketXs>(const PacketXs& /*a*/) {
  return __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs padd<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vadd_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs psub<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vsub(a, b, unpacket_traits<PacketXs>::size);
}
template <>
EIGEN_STRONG_INLINE PacketXs pnegate<PacketXs>(const PacketXs& a) {
  return __riscv_vneg(a, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pconj<PacketXs>(const PacketXs& a) { return a; }

template <>
EIGEN_STRONG_INLINE PacketXs pmul<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vmul(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pdiv<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vdiv(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
  return __riscv_vmadd(a, b, c, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pnmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
  return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pnmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
  return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pmin<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vmin(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pmax<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vmax(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pcmp_le<PacketXs>(const PacketXs& a, const PacketXs& b) {
  PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits<PacketXs>::size);
  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pcmp_lt<PacketXs>(const PacketXs& a, const PacketXs& b) {
  PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits<PacketXs>::size);
  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pcmp_eq<PacketXs>(const PacketXs& a, const PacketXs& b) {
  PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits<PacketXs>::size);
  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs ptrue<PacketXs>(const PacketXs& /*a*/) {
  return __riscv_vmv_v_x_i16m1(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pand<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vand_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs por<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vor_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pxor<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vxor_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pandnot<PacketXs>(const PacketXs& a, const PacketXs& b) {
  return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits<PacketXs>::size),
                               unpacket_traits<PacketXs>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketXs parithmetic_shift_right(PacketXs a) {
  return __riscv_vsra_vx_i16m1(a, N, unpacket_traits<PacketXs>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketXs plogical_shift_right(PacketXs a) {
  return __riscv_vreinterpret_i16m1(
      __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits<PacketXs>::size));
}

template <int N>
EIGEN_STRONG_INLINE PacketXs plogical_shift_left(PacketXs a) {
  return __riscv_vsll_vx_i16m1(a, N, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pload<PacketXs>(const numext::int16_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketXs>::size);
}
template <>
EIGEN_STRONG_INLINE PacketXs ploadu<PacketXs>(const numext::int16_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs ploaddup<PacketXs>(const numext::int16_t* from) {
  PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size);
  idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits<PacketXs>::size);
  // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs ploadquad<PacketXs>(const numext::int16_t* from) {
  PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size);
  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<PacketXs>::size), 1,
                              unpacket_traits<PacketXs>::size);
  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const PacketXs& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const PacketXs& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketXs pgather<numext::int16_t, PacketXs>(const numext::int16_t* from, Index stride) {
  return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketXs>(numext::int16_t* to, const PacketXs& from,
                                                                  Index stride) {
  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketXs>(const PacketXs& a) {
  return __riscv_vmv_x_s_i16m1_i16(a);
}

template <>
EIGEN_STRONG_INLINE PacketXs preverse(const PacketXs& a) {
  PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size),
                                         unpacket_traits<PacketXs>::size - 1, unpacket_traits<PacketXs>::size);
  return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketXs pabs(const PacketXs& a) {
  // Branch-free integer abs: (a ^ (a >> 15)) - (a >> 15).
  PacketXs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits<PacketXs>::size);
  return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits<PacketXs>::size), mask,
                               unpacket_traits<PacketXs>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux<PacketXs>(const PacketXs& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketXs>::size),
                                                      unpacket_traits<PacketXs>::size));
}
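// predux_mul has no dedicated RVV reduction instruction, so it is a log2(n) tree:
// multiplying the vector by its own reverse pairs lane i with lane n-1-i, and each
// vslidedown_vx then folds the upper part onto the lower one. The EIGEN_RISCV64_RVV_VL
// checks compile away the folds that a narrower VLEN does not need; e.g. VLEN = 128
// holds 8 int16 lanes per m1 register and starts directly at the slide-by-2 step.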
template <>
EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketXs>(const PacketXs& a) {
  // Multiply the vector by its reverse
  PacketXs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits<PacketXs>::size);
  PacketXs half_prod;

  if (EIGEN_RISCV64_RVV_VL >= 1024) {
    half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits<PacketXs>::size);
    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 512) {
    half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits<PacketXs>::size);
    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 256) {
    half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits<PacketXs>::size);
    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
  }
  // Last reduction
  half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits<PacketXs>::size);
  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);

  half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits<PacketXs>::size);
  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);

  // The reduction is done to the first element.
  return pfirst(prod);
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketXs>(const PacketXs& a) {
  return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1(
      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketXs>::size),
      unpacket_traits<PacketXs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketXs>(const PacketXs& a) {
  return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1(
      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketXs>::size),
      unpacket_traits<PacketXs>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXs, N>& kernel) {
  numext::int16_t buffer[unpacket_traits<PacketXs>::size * N] = {0};
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketXs>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits<PacketXs>::size], unpacket_traits<PacketXs>::size);
  }
}

/********************************* PacketMul4Xs ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pset1<PacketMul4Xs>(const numext::int16_t& from) {
  return __riscv_vmv_v_x_i16m4(from, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs plset<PacketMul4Xs>(const numext::int16_t& a) {
  PacketMul4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xs>::size));
  return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pzero<PacketMul4Xs>(const PacketMul4Xs& /*a*/) {
  return __riscv_vmv_v_x_i16m4(0, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs padd<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vadd_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs psub<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vsub(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pnegate<PacketMul4Xs>(const PacketMul4Xs& a) {
  return __riscv_vneg(a, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pconj<PacketMul4Xs>(const PacketMul4Xs& a) { return a; }

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pmul<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vmul(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pdiv<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vdiv(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
  return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
  return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pmin<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vmin(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pmax<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vmax(a, b, unpacket_traits<PacketMul4Xs>::size);
}
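// The integer comparisons below materialize their vbool4_t mask as a full packet,
// since the rest of Eigen expects all-ones/all-zero lanes rather than an RVV mask
// register: vmerge_vxm broadcasts 0xffff into the lanes selected by the mask and
// keeps pzero(a) everywhere else.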
template <>
EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
                                  unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
                                  unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
                                  unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs ptrue<PacketMul4Xs>(const PacketMul4Xs& /*a*/) {
  return __riscv_vmv_v_x_i16m4(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pand<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vand_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs por<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vor_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pxor<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vxor_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pandnot<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
  return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits<PacketMul4Xs>::size),
                               unpacket_traits<PacketMul4Xs>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) {
  return __riscv_vsra_vx_i16m4(a, N, unpacket_traits<PacketMul4Xs>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) {
  return __riscv_vreinterpret_i16m4(
      __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits<PacketMul4Xs>::size));
}

template <int N>
EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) {
  return __riscv_vsll_vx_i16m4(a, N, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pload<PacketMul4Xs>(const numext::int16_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs ploadu<PacketMul4Xs>(const numext::int16_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<PacketMul4Xs>::size);
}
template <>
EIGEN_STRONG_INLINE PacketMul4Xs ploaddup<PacketMul4Xs>(const numext::int16_t* from) {
  PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xs>::size);
  idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits<PacketMul4Xs>::size);
  // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
  return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs ploadquad<PacketMul4Xs>(const numext::int16_t* from) {
  PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xs>::size);
  idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits<PacketMul4Xs>::size), 1,
                              unpacket_traits<PacketMul4Xs>::size);
  return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const PacketMul4Xs& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const PacketMul4Xs& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather<numext::int16_t, PacketMul4Xs>(const numext::int16_t* from,
                                                                             Index stride) {
  return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketMul4Xs>(numext::int16_t* to, const PacketMul4Xs& from,
                                                                      Index stride) {
  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketMul4Xs>(const PacketMul4Xs& a) {
  return __riscv_vmv_x_s_i16m4_i16(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) {
  PacketMul4Xsu idx =
      __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xs>::size),
                             unpacket_traits<PacketMul4Xs>::size - 1, unpacket_traits<PacketMul4Xs>::size);
  return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) {
  PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits<PacketMul4Xs>::size);
  return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits<PacketMul4Xs>::size), mask,
                               unpacket_traits<PacketMul4Xs>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul4Xs>(const PacketMul4Xs& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1(
      a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul4Xs>::size / 4), unpacket_traits<PacketMul4Xs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul4Xs>(const PacketMul4Xs& a) {
  PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1),
                                         unpacket_traits<PacketXs>::size);
  PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3),
                                         unpacket_traits<PacketXs>::size);
  return predux_mul<PacketXs>(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits<PacketXs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketMul4Xs>(const PacketMul4Xs& a) {
  return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1(
      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul4Xs>::size / 4),
      unpacket_traits<PacketMul4Xs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketMul4Xs>(const PacketMul4Xs& a) {
  return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1(
      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul4Xs>::size / 4),
      unpacket_traits<PacketMul4Xs>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xs, N>& kernel) {
  numext::int16_t buffer[unpacket_traits<PacketMul4Xs>::size * N] = {0};
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul4Xs>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits<PacketMul4Xs>::size], unpacket_traits<PacketMul4Xs>::size);
  }
}
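// All ptranspose overloads in this file share the scratch-buffer scheme used above:
// packet i is written with a strided vsse16/vsse64 so that its elements land in
// column i of a row-major size x N buffer, and contiguous reloads then pick up the
// transposed data. Illustrative values, assuming N == 2 and 8-lane packets: lanes
// a0..a7 and b0..b7 come back as {a0 b0 a1 b1 a2 b2 a3 b3} and {a4 b4 ... a7 b7}.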
/********************************* PacketMul2Xs ************************************/

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pset1<PacketMul2Xs>(const numext::int16_t& from) {
  return __riscv_vmv_v_x_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs plset<PacketMul2Xs>(const numext::int16_t& a) {
  PacketMul2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size));
  return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pzero<PacketMul2Xs>(const PacketMul2Xs& /*a*/) {
  return __riscv_vmv_v_x_i16m2(0, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs padd<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vadd_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs psub<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vsub(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pnegate<PacketMul2Xs>(const PacketMul2Xs& a) {
  return __riscv_vneg(a, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pconj<PacketMul2Xs>(const PacketMul2Xs& a) { return a; }

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pmul<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vmul(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pdiv<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vdiv(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
  return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
  return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pmin<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vmin(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pmax<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vmax(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
                                  unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
                                  unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
                                  unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs ptrue<PacketMul2Xs>(const PacketMul2Xs& /*a*/) {
  return __riscv_vmv_v_x_i16m2(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pand<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vand_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
}
template <>
EIGEN_STRONG_INLINE PacketMul2Xs por<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vor_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pxor<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vxor_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pandnot<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
  return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits<PacketMul2Xs>::size),
                               unpacket_traits<PacketMul2Xs>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) {
  return __riscv_vsra_vx_i16m2(a, N, unpacket_traits<PacketMul2Xs>::size);
}

template <int N>
EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) {
  return __riscv_vreinterpret_i16m2(
      __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits<PacketMul2Xs>::size));
}

template <int N>
EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) {
  return __riscv_vsll_vx_i16m2(a, N, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pload<PacketMul2Xs>(const numext::int16_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs ploadu<PacketMul2Xs>(const numext::int16_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs ploaddup<PacketMul2Xs>(const numext::int16_t* from) {
  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size);
  idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits<PacketMul2Xs>::size);
  // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
  return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs ploadquad<PacketMul2Xs>(const numext::int16_t* from) {
  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size);
  idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits<PacketMul2Xs>::size), 1,
                              unpacket_traits<PacketMul2Xs>::size);
  return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const PacketMul2Xs& from) {
  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const PacketMul2Xs& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather<numext::int16_t, PacketMul2Xs>(const numext::int16_t* from,
                                                                             Index stride) {
  return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketMul2Xs>(numext::int16_t* to, const PacketMul2Xs& from,
                                                                      Index stride) {
  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketMul2Xs>(const PacketMul2Xs& a) {
  return __riscv_vmv_x_s_i16m2_i16(a);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) {
  PacketMul2Xsu idx =
      __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size),
                             unpacket_traits<PacketMul2Xs>::size - 1, unpacket_traits<PacketMul2Xs>::size);
  return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits<PacketMul2Xs>::size);
}

template <>
EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) {
  PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits<PacketMul2Xs>::size);
  return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits<PacketMul2Xs>::size), mask,
                               unpacket_traits<PacketMul2Xs>::size);
}
template <>
EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul2Xs>(const PacketMul2Xs& a) {
  return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1(
      a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul2Xs>::size / 2), unpacket_traits<PacketMul2Xs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul2Xs>(const PacketMul2Xs& a) {
  return predux_mul<PacketXs>(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
                                                    unpacket_traits<PacketXs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketMul2Xs>(const PacketMul2Xs& a) {
  return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1(
      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul2Xs>::size / 2),
      unpacket_traits<PacketMul2Xs>::size));
}

template <>
EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketMul2Xs>(const PacketMul2Xs& a) {
  return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1(
      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul2Xs>::size / 2),
      unpacket_traits<PacketMul2Xs>::size));
}

template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xs, N>& kernel) {
  numext::int16_t buffer[unpacket_traits<PacketMul2Xs>::size * N] = {0};
  int i = 0;

  for (i = 0; i < N; i++) {
    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul2Xs>::size);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] =
        __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits<PacketMul2Xs>::size], unpacket_traits<PacketMul2Xs>::size);
  }
}

template <typename Packet = PacketMul4Xs>
EIGEN_STRONG_INLINE
    typename std::enable_if<std::is_same<Packet, PacketMul4Xs>::value && (unpacket_traits<PacketMul4Xs>::size % 8) == 0,
                            PacketMul2Xs>::type
    predux_half_dowto4(const PacketMul4Xs& a) {
  return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1),
                               unpacket_traits<PacketMul2Xs>::size);
}

template <typename Packet = PacketMul2Xs>
EIGEN_STRONG_INLINE
    typename std::enable_if<std::is_same<Packet, PacketMul2Xs>::value && (unpacket_traits<PacketMul2Xs>::size % 8) == 0,
                            PacketXs>::type
    predux_half_dowto4(const PacketMul2Xs& a) {
  return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
                               unpacket_traits<PacketXs>::size);
}

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_PACKET_MATH_RVV10_H