From 84cf3ff18d751cf57a63f4e465b5758fb6d1893f Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Mon, 27 Jun 2022 19:18:00 +0000
Subject: [PATCH] Add pload_partial, pstore_partial (and unaligned versions),
 pgather_partial, pscatter_partial, loadPacketPartial and storePacketPartial.

---
 Eigen/src/Core/GenericPacketMath.h       | 112 ++++-
 Eigen/src/Core/arch/AltiVec/Complex.h    |  79 ++-
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 583 ++++++++++++++++++-----
 Eigen/src/Core/util/BlasUtil.h           |  40 ++
 test/packetmath.cpp                      |  60 +++
 5 files changed, 745 insertions(+), 129 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 3ea6855eb..8119200ba 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -606,14 +606,46 @@ pldexp(const Packet &a, const Packet &exponent) {
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) {
   return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b));
 }
 
-/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
+/** \internal \returns a packet version of \a *from, from must be properly aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned
+  * offset indicates the starting element at which to load and
+  * offset + n <= unpacket_traits<Packet>::size
+  * All elements before offset and after the last element loaded will be initialized to zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
+    elements[i] = from[i-offset];
+  }
+  return pload<Packet>(elements);
+}
+
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
+  * All elements after the last element loaded will be initialized to zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    elements[i] = from[i];
+  }
+  return pload<Packet>(elements);
+}
+
 /** \internal \returns a packet version of \a *from, (un-aligned masked load)
  * There is no generic implementation. We only have implementations for specialized
  * cases. Generic case should not be called.
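For clarity, the semantics of the generic pload_partial above can be modeled in plain scalar C++. The following is a minimal illustrative sketch, not code from the patch: the 4-lane std::array "packet" and the name pload_partial_model are hypothetical stand-ins for a real SIMD packet type.

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>

// Hypothetical 4-lane "packet" modeling e.g. Packet4f.
using Packet4fModel = std::array<float, 4>;

// Model of pload_partial: load n elements into lanes [offset, offset+n);
// every other lane is zero-initialized, mirroring the generic fallback above.
Packet4fModel pload_partial_model(const float* from, int n, int offset = 0) {
  assert(n + offset <= 4 && "number of elements plus offset will read past end of packet");
  Packet4fModel p = {0.f, 0.f, 0.f, 0.f};
  for (int i = offset; i < std::min(n + offset, 4); ++i)
    p[i] = from[i - offset];
  return p;
}

int main() {
  const float data[2] = {1.f, 2.f};
  Packet4fModel p = pload_partial_model(data, 2, 1); // lanes: {0, 1, 2, 0}
  for (float x : p) std::printf("%g ", x);           // prints "0 1 2 0"
  std::printf("\n");
  return 0;
}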
@@ -704,14 +736,40 @@ peven_mask(const Packet& /*a*/) {
 }
 
-/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
+/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }
 
+/** \internal copy n elements of the packet \a from to \a *to, \a to must be properly aligned
+  * offset indicates the starting element at which to store and
+  * offset + n <= unpacket_traits<Packet>::size */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
+    to[i] = elements[i + offset];
+  }
+}
+
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 { (*to) = from; }
 
+/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    to[i] = elements[i];
+  }
+}
+
 /** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
  * There is no generic implementation. We only have implementations for specialized
  * cases. Generic case should not be called.
@@ -721,11 +779,31 @@ EIGEN_DEVICE_FUNC inline
 std::enable_if_t<unpacket_traits<Packet>::masked_store_available, void>
 pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
 
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
- { return ploadu<Packet>(from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
+{ return ploadu<Packet>(from); }
 
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
- { pstore(to, from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    elements[i] = from[i*stride];
+  }
+  return pload<Packet>(elements);
+}
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
+{ pstore(to, from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    to[i*stride] = elements[i];
+  }
+}
 
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
@@ -996,6 +1074,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_trai
   return ploadu<Packet>(from);
 }
 
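Taken together, the partial load/store and strided partial gather/scatter primitives let a kernel handle a remainder of fewer than PacketSize elements with a single partial operation instead of a scalar cleanup loop. A sketch of the intended calling pattern follows; it assumes this patch is applied, and add_one is a hypothetical illustration, not part of Eigen:

#include <Eigen/Core>

// Illustrative only: add 1 to `size` scalars, full packets first, then one
// partial load/store for the tail. Assumes the partial ops from this patch.
template <typename Packet, typename Scalar>
void add_one(Scalar* data, Eigen::Index size) {
  const Eigen::Index PacketSize = Eigen::internal::unpacket_traits<Packet>::size;
  const Packet one = Eigen::internal::pset1<Packet>(Scalar(1));
  Eigen::Index i = 0;
  for (; i + PacketSize <= size; i += PacketSize) {
    Packet p = Eigen::internal::ploadu<Packet>(data + i);
    Eigen::internal::pstoreu(data + i, Eigen::internal::padd(p, one));
  }
  const Eigen::Index n = size - i;  // tail: 0 <= n < PacketSize
  if (n > 0) {
    // Lanes >= n are zero after the partial load and become garbage after
    // padd, but pstoreu_partial writes only the first n lanes back.
    Packet p = Eigen::internal::ploadu_partial<Packet>(data + i, n);
    Eigen::internal::pstoreu_partial(data + i, Eigen::internal::padd(p, one), n);
  }
}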
+/** \internal \returns n elements of a packet version of \a *from.
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    return pload_partial<Packet>(from, n, offset);
+  else
+    return ploadu_partial<Packet>(from, n);
+}
+
 /** \internal copy the packet \a from to \a *to.
  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
 template<typename Scalar, typename Packet, int Alignment>
@@ -1007,6 +1096,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
   pstoreu(to, from);
 }
 
+/** \internal copy n elements of the packet \a from to \a *to.
+  * The pointer \a to must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    pstore_partial(to, from, n, offset);
+  else
+    pstoreu_partial(to, from, n);
+}
+
 /** \internal \returns a packet version of \a *from.
  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
  * hardware if available to speedup the loading of data that won't be modified
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index ba5a3fddd..60460350a 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -132,10 +132,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex
 template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
+{
+  return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
+}
+template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
+{
+  return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
 
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); }
 
 EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
@@ -157,19 +167,46 @@ EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std
   return Packet2cf(res0);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from)
 {
-  EIGEN_ALIGN16 std::complex<float> af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet2cf>(af);
+  Packet2cf res;
+  res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
+  return res;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index
stride) + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2) { - EIGEN_ALIGN16 std::complex af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + eigen_assert(n <= unpacket_traits::size && "number of elements will gather past end of packet"); + EIGEN_ALIGN16 Scalar af[2]; + for (Index i = 0; i < n; i++) { + af[i] = from[i*stride]; + } + return pload_ignore(af); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + return pgather_complex_size2, Packet2cf>(from, stride); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial, Packet2cf>(const std::complex* from, Index stride, const Index n) +{ + return pgather_complex_size2, Packet2cf>(from, stride, n); +} +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2) +{ + eigen_assert(n <= unpacket_traits::size && "number of elements will scatter past end of packet"); + EIGEN_ALIGN16 Scalar af[2]; + pstore((Scalar *) af, from); + for (Index i = 0; i < n; i++) { + to[i*stride] = af[i]; + } +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + pscatter_complex_size2, Packet2cf>(to, from, stride); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride, const Index n) +{ + pscatter_complex_size2, Packet2cf>(to, from, stride, n); } template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } @@ -336,17 +373,35 @@ template<> struct unpacket_traits { typedef std::complex type template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial(const std::complex* from, const Index n, const Index offset) +{ + return Packet1cd(pload_partial((const double*)from, n * 2, offset * 2)); +} +template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial(const std::complex* from, const Index n) +{ + return Packet1cd(ploadu_partial((const double*)from, n * 2)); +} template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } +template<> EIGEN_ALWAYS_INLINE void pstore_partial >(std::complex * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); } +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial >(std::complex * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather, Packet1cd>(const std::complex* from, Index) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const 
Packet1cd& from, Index) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial, Packet1cd>(const std::complex* from, Index, const Index) +{ + return pload(from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index) +{ + pstore >(to, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial, Packet1cd>(std::complex* to, const Packet1cd& from, Index, const Index) { pstore >(to, from); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 91b3e2063..4dd53f6a4 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -143,6 +143,12 @@ static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #endif +#if EIGEN_COMP_LLVM +#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16") +#else +#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)") +#endif + template <> struct packet_traits : default_packet_traits { typedef Packet4f type; @@ -471,6 +477,118 @@ template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* fr return pload_common(reinterpret_cast(from)); } +template +EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from) +{ + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); + EIGEN_DEBUG_ALIGNED_LOAD + // Ignore partial input memory initialized +#if !EIGEN_COMP_LLVM + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif +#ifdef __VSX__ + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); +#else + return vec_ld(0, from); +#endif +#if !EIGEN_COMP_LLVM + #pragma GCC diagnostic pop +#endif +} + +template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore(const bfloat16* from) +{ + return pload_ignore(reinterpret_cast(from)); +} + +template +EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset) +{ + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. 
+ const Index packet_size = unpacket_traits::size; + eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_DEBUG_ALIGNED_LOAD + EIGEN_UNUSED_VARIABLE(from); + Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size); + if (offset) { + Packet16uc shift = pset1(offset * 8 * size); +#ifdef _BIG_ENDIAN + load = Packet(vec_sro(Packet16uc(load), shift)); +#else + load = Packet(vec_slo(Packet16uc(load), shift)); +#endif + } + return load; +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size]; + unsigned char* load2 = reinterpret_cast(load + offset); + unsigned char* from2 = reinterpret_cast(const_cast<__UNPACK_TYPE__(Packet)*>(from)); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstoreu(load2, ploadu(from2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + } + return pload_ignore(load); +#endif +} + +template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial(const float* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial(const int* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial(const short int* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial(const unsigned short int* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial(const bfloat16* from, const Index n, const Index offset) +{ + return pload_partial_common(reinterpret_cast(from), n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial(const signed char* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial(const unsigned char* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + template EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ // some versions of GCC throw "unused-but-set-parameter" (float *to). @@ -519,6 +637,90 @@ template<> EIGEN_STRONG_INLINE void pstore(unsigned char* t pstore_common(to, from); } +template EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset) +{ + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. 
+ const Index packet_size = unpacket_traits::size; + eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_UNUSED_VARIABLE(to); + EIGEN_DEBUG_ALIGNED_STORE + Packet store = from; + if (offset) { + Packet16uc shift = pset1(offset * 8 * size); +#ifdef _BIG_ENDIAN + store = Packet(vec_slo(Packet16uc(store), shift)); +#else + store = Packet(vec_sro(Packet16uc(store), shift)); +#endif + } + vec_xst_len(store, to, n * size); +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size]; + pstore(store, from); + unsigned char* store2 = reinterpret_cast(store + offset); + unsigned char* to2 = reinterpret_cast(to); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstore(to2, ploadu(store2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + } +#endif +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(float* to, const Packet4f& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(int* to, const Packet4i& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(short int* to, const Packet8s& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(unsigned short int* to, const Packet8us& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(bfloat16* to, const Packet8bf& from, const Index n, const Index offset) +{ + pstore_partial_common(reinterpret_cast(to), from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(signed char* to, const Packet16c& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(unsigned char* to, const Packet16uc& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + template EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) { @@ -596,168 +798,167 @@ pbroadcast4(const int *a, pbroadcast4_common(a, a0, a1, a2, a3); } -template EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits::size) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; - a[0] = from[0*stride]; - a[1] = from[1*stride]; - a[2] = from[2*stride]; - a[3] = from[3*stride]; - return pload(a); + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits::size]; + eigen_assert(n <= unpacket_traits::size && "number of elements will gather past end of packet"); + LOAD_STORE_UNROLL_16 + for (Index i = 0; i < n; i++) { + a[i] = from[i*stride]; + } + // Leave rest of the array uninitialized + return pload_ignore(a); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +template<> 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather(const float* from, Index stride) { return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather(const int* from, Index stride) { return pgather_common(from, stride); } -template EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather(const short int* from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; - a[0] = from[0*stride]; - a[1] = from[1*stride]; - a[2] = from[2*stride]; - a[3] = from[3*stride]; - a[4] = from[4*stride]; - a[5] = from[5*stride]; - a[6] = from[6*stride]; - a[7] = from[7*stride]; - return pload(a); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const short int* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather(const unsigned short int* from, Index stride) { - return pgather_size8(from, stride); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const unsigned short int* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather(const bfloat16* from, Index stride) { - return pgather_size8(from, stride); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather(const bfloat16* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather(const signed char* from, Index stride) { - return pgather_size8(from, stride); + return pgather_common(from, stride); } -template EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather(const unsigned char* from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; - a[0] = from[0*stride]; - a[1] = from[1*stride]; - a[2] = from[2*stride]; - a[3] = from[3*stride]; - a[4] = from[4*stride]; - a[5] = from[5*stride]; - a[6] = from[6*stride]; - a[7] = from[7*stride]; - a[8] = from[8*stride]; - a[9] = from[9*stride]; - a[10] = from[10*stride]; - a[11] = from[11*stride]; - a[12] = from[12*stride]; - a[13] = from[13*stride]; - a[14] = from[14*stride]; - a[15] = from[15*stride]; - return pload(a); + return pgather_common(from, stride); } - -template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const signed char* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial(const float* from, Index stride, const Index n) { - return pgather_size16(from, stride); + return pgather_common(from, stride, n); } -template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const unsigned char* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial(const int* from, Index stride, const Index n) { - return pgather_size16(from, stride); + return pgather_common(from, stride, n); } -template EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial(const short int* from, Index stride, const Index n) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial(const unsigned short int* from, Index stride, const Index n) +{ + 
return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial(const bfloat16* from, Index stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial(const signed char* from, Index stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial(const unsigned char* from, Index stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits::size) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits::size]; + eigen_assert(n <= unpacket_traits::size && "number of elements will scatter past end of packet"); pstore<__UNPACK_TYPE__(Packet)>(a, from); - to[0*stride] = a[0]; - to[1*stride] = a[1]; - to[2*stride] = a[2]; - to[3*stride] = a[3]; + LOAD_STORE_UNROLL_16 + for (Index i = 0; i < n; i++) { + to[i*stride] = a[i]; + } } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(float* to, const Packet4f& from, Index stride) { - pscatter_size4(to, from, stride); + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(int* to, const Packet4i& from, Index stride) { - pscatter_size4(to, from, stride); + pscatter_common(to, from, stride); } -template EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(short int* to, const Packet8s& from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; - pstore<__UNPACK_TYPE__(Packet)>(a, from); - to[0*stride] = a[0]; - to[1*stride] = a[1]; - to[2*stride] = a[2]; - to[3*stride] = a[3]; - to[4*stride] = a[4]; - to[5*stride] = a[5]; - to[6*stride] = a[6]; - to[7*stride] = a[7]; + pscatter_common(to, from, stride); } - -template<> EIGEN_DEVICE_FUNC inline void pscatter(short int* to, const Packet8s& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(unsigned short int* to, const Packet8us& from, Index stride) { - pscatter_size8(to, from, stride); + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned short int* to, const Packet8us& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(bfloat16* to, const Packet8bf& from, Index stride) { - pscatter_size8(to, from, stride); + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(signed char* to, const Packet16c& from, Index stride) { - pscatter_size8(to, from, stride); + pscatter_common(to, from, stride); } -template EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(unsigned char* to, const Packet16uc& from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; - pstore<__UNPACK_TYPE__(Packet)>(a, from); - to[0*stride] = a[0]; - to[1*stride] = a[1]; - 
to[2*stride] = a[2]; - to[3*stride] = a[3]; - to[4*stride] = a[4]; - to[5*stride] = a[5]; - to[6*stride] = a[6]; - to[7*stride] = a[7]; - to[8*stride] = a[8]; - to[9*stride] = a[9]; - to[10*stride] = a[10]; - to[11*stride] = a[11]; - to[12*stride] = a[12]; - to[13*stride] = a[13]; - to[14*stride] = a[14]; - to[15*stride] = a[15]; + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(signed char* to, const Packet16c& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(float* to, const Packet4f& from, Index stride, const Index n) { - pscatter_size16(to, from, stride); + pscatter_common(to, from, stride, n); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned char* to, const Packet16uc& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(int* to, const Packet4i& from, Index stride, const Index n) { - pscatter_size16(to, from, stride); + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(short int* to, const Packet8s& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(unsigned short int* to, const Packet8us& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(bfloat16* to, const Packet8bf& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(signed char* to, const Packet16c& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(unsigned char* to, const Packet16uc& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } @@ -1008,6 +1209,73 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const unsigned char return ploadu_common(from); } +template EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n) +{ + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements will read past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_DEBUG_ALIGNED_LOAD + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size); +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size]; + unsigned char* load2 = reinterpret_cast(load); + unsigned char* from2 = reinterpret_cast(const_cast<__UNPACK_TYPE__(Packet)*>(from)); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstore(load2, ploadu(from2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + } + return pload_ignore(load); +#endif +} + +template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial(const float* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet4i 
ploadu_partial(const int* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial(const short int* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial(const unsigned short int* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial(const bfloat16* from, const Index n) +{ + return ploadu_partial_common(reinterpret_cast(from), n); +} +template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial(const signed char* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial(const unsigned char* from, const Index n) +{ + return ploadu_partial_common(from, n); +} + template EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) { Packet p; @@ -1128,6 +1396,77 @@ template<> EIGEN_STRONG_INLINE void pstoreu(unsigned char* t pstoreu_common(to, from); } +template EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n) +{ + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements will write past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_DEBUG_UNALIGNED_STORE + vec_xst_len(from, to, n * size); +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size]; + pstore(store, from); + unsigned char* store2 = reinterpret_cast(store); + unsigned char* to2 = reinterpret_cast(to); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstoreu(to2, pload(store2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + } + + LOAD_STORE_UNROLL_16 + for (Index i = 0; i < n; i++) { + to[i] = from[i]; + } +#endif +} + +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(float* to, const Packet4f& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(int* to, const Packet4i& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(short int* to, const Packet8s& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(unsigned short int* to, const Packet8us& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(bfloat16* to, const Packet8bf& from, const Index n) +{ + pstoreu_partial_common(reinterpret_cast(to), from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(signed char* to, const Packet16c& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(unsigned char* to, const Packet16uc& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} + template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } @@ -2387,12 +2726,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pload(const 
double* from) return vec_xl(0, const_cast(from)); // cast needed by Clang } +template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial(const double* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_xst(from, 0, to); } +template<> EIGEN_ALWAYS_INLINE void pstore_partial(double* to, const Packet2d& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { Packet2d v = {from, from}; return v; @@ -2414,19 +2763,21 @@ pbroadcast4(const double *a, a3 = pset1(a[3]); } -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather(const double* from, Index stride) { - EIGEN_ALIGN16 double af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial(const double* from, Index stride, const Index n) { - EIGEN_ALIGN16 double af[2]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + return pgather_common(from, stride, n); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(double* to, const Packet2d& from, Index stride) +{ + pscatter_common(to, from, stride); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(double* to, const Packet2d& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return pset1(a) + p2d_COUNTDOWN; } @@ -2517,6 +2868,11 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) return vec_xl(0, const_cast(from)); } +template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial(const double* from, const Index n) +{ + return ploadu_partial_common(from, n); +} + template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p; @@ -2531,6 +2887,11 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& vec_xst(from, 0, to); } +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(double* to, const Packet2d& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} + template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index e2eef19a9..56473a9ac 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -100,6 +100,11 @@ public: return ploadt(m_data + i); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const { + return ploadt_partial(m_data + i, n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const { return ploadt(m_data + i); @@ -110,6 +115,11 @@ public: pstoret(m_data + i, p); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index offset = 0) const { + pstoret_partial(m_data + i, p, n, offset); + } + protected: Scalar *m_data; }; @@ -208,6 +218,11 @@ public: return 
ploadt(&operator()(i, j)); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index offset = 0) const { + return ploadt_partial(&operator()(i, j), n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { return ploadt(&operator()(i, j)); @@ -218,6 +233,11 @@ public: pstoret(&operator()(i, j), p); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index offset = 0) const { + pstoret_partial(&operator()(i, j), p, n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); @@ -271,11 +291,21 @@ public: return pgather(m_data + i*m_incr.value(), m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const { + return pgather_partial(m_data + i*m_incr.value(), m_incr.value(), n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { pscatter(m_data + i*m_incr.value(), p, m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/) const { + pscatter_partial(m_data + i*m_incr.value(), p, m_incr.value(), n); + } + protected: Scalar *m_data; const internal::variable_if_dynamic m_incr; @@ -312,6 +342,11 @@ public: return pgather(&operator()(i, j),m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/) const { + return pgather_partial(&operator()(i, j),m_incr.value(),n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { return pgather(&operator()(i, j),m_incr.value()); @@ -322,6 +357,11 @@ public: pscatter(&operator()(i, j), p, m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/) const { + pscatter_partial(&operator()(i, j), p, m_incr.value(), n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 2d8e70871..163ef4721 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -458,6 +458,36 @@ void packetmath() { VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu"); } + for (int M = 0; M < PacketSize; ++M) { + for (int N = 0; N <= PacketSize; ++N) { + for (int j = 0; j < size; ++j) { + data1[j] = internal::random() / RealScalar(PacketSize); + data2[j] = internal::random() / RealScalar(PacketSize); + refvalue = (std::max)(refvalue, numext::abs(data1[j])); + } + + if (M == 0) { + internal::pstore_partial(data2, internal::pload_partial(data1, N), N); + VERIFY(test::areApprox(data1, data2, N) && "aligned loadN/storeN"); + + for (int offset = 0; offset < PacketSize; ++offset) { + internal::pstore_partial(data2, internal::ploadu_partial(data1 + offset, N), N); + VERIFY(test::areApprox(data1 + offset, data2, N) && "internal::ploadu_partial"); + } + + for (int offset = 0; offset < PacketSize; ++offset) { + internal::pstoreu_partial(data2 + offset, internal::pload_partial(data1, N), N); + VERIFY(test::areApprox(data1, data2 + offset, N) && "internal::pstoreu_partial"); + } + 
}
+
+      if (N + M > PacketSize) continue; // Don't read or write past end of Packet
+
+      internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N, M), N, M);
+      VERIFY(test::areApprox(data1, data2, N) && "aligned offset loadN/storeN");
+    }
+  }
+
   if (internal::unpacket_traits<Packet>::masked_load_available) {
     test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
     unsigned long long max_umask = (0x1ull << PacketSize);
@@ -1372,6 +1402,36 @@ void packetmath_scatter_gather() {
   for (int i = 0; i < PacketSize; ++i) {
     VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
   }
+
+  for (Index N = 0; N <= PacketSize; ++N) {
+    for (Index i = 0; i < N; ++i) {
+      data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    }
+
+    for (Index i = 0; i < N * 20; ++i) {
+      buffer[i] = Scalar(0);
+    }
+
+    packet = internal::pload_partial<Packet>(data1, N);
+    internal::pscatter_partial(buffer, packet, stride, N);
+
+    for (Index i = 0; i < N * 20; ++i) {
+      if ((i % stride) == 0 && i < stride * N) {
+        VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter_partial");
+      } else {
+        VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter_partial");
+      }
+    }
+
+    for (Index i = 0; i < N * 7; ++i) {
+      buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    }
+    packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
+    internal::pstore_partial(data1, packet, N);
+    for (Index i = 0; i < N; ++i) {
+      VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather_partial");
+    }
+  }
 }
 
 namespace Eigen {
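One implementation detail worth calling out: on pre-Power9 targets, the AltiVec partial load/store fallbacks above avoid a per-element byte loop by copying the n * sizeof(Scalar) bytes of a partial packet with at most five power-of-two sized moves (16, 8, 4, 2, 1 bytes). That decomposition can be sketched in portable C++ as follows; memcpy stands in for the typed 8/4/2/1-byte accesses the patch uses, so this is a model of the technique, not the patch's code:

#include <cstring>

// Copy n2 (0..16) bytes using at most five power-of-two sized moves,
// mirroring the 16/8/4/2/1 cascade in the pre-Power9 fallback paths.
void copy_partial_bytes(unsigned char* dst, const unsigned char* src, int n2) {
  int i = 0;
  if (16 <= n2)    { std::memcpy(dst, src, 16);        i += 16; } // whole vector
  if (i + 8 <= n2) { std::memcpy(dst + i, src + i, 8); i += 8;  }
  if (i + 4 <= n2) { std::memcpy(dst + i, src + i, 4); i += 4;  }
  if (i + 2 <= n2) { std::memcpy(dst + i, src + i, 2); i += 2;  }
  if (i < n2)      { dst[i] = src[i]; }                          // last odd byte
}

Since n2 is at most the 16-byte vector width, the five conditional moves cover every possible remainder exactly once, which is why the cascade needs no loop.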