Add pload_partial, pstore_partial (and unaligned versions), pgather_partial, pscatter_partial, loadPacketPartial and storePacketPartial.

This commit is contained in:
Chip Kerchner 2022-06-27 19:18:00 +00:00 committed by Rasmus Munk Larsen
parent c603275dc9
commit 84cf3ff18d
5 changed files with 745 additions and 129 deletions

View File

@ -606,14 +606,46 @@ pldexp(const Packet &a, const Packet &exponent) {
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ /** \internal \returns a packet version of \a *from, from must be properly aligned */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pload(const typename unpacket_traits<Packet>::type* from) { return *from; } pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned
 * offset indicates the starting element in which to load and
 * offset + n <= unpacket_traits::size
 * All elements before offset and after the last element loaded are initialized with zero */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pload_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
{
  typedef typename unpacket_traits<Packet>::type Scalar;
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
  // Stage the requested elements into a zero-filled, aligned scratch buffer,
  // then perform one full aligned packet load from it.
  EIGEN_ALIGN_MAX Scalar buffer[packet_size] = { Scalar(0) };
  const Index end = numext::mini(n + offset, packet_size);
  for (Index lane = offset; lane < end; lane++) {
    buffer[lane] = from[lane - offset];
  }
  return pload<Packet>(buffer);
}
/** \internal \returns a packet version of \a *from, (un-aligned load) */ /** \internal \returns a packet version of \a *from, (un-aligned load) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; } ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
 * All elements after the last element loaded are initialized with zero */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
{
  typedef typename unpacket_traits<Packet>::type Scalar;
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
  // Stage the first n elements into a zero-filled, aligned scratch buffer,
  // then perform one full aligned packet load from it.
  EIGEN_ALIGN_MAX Scalar buffer[packet_size] = { Scalar(0) };
  const Index count = numext::mini(n, packet_size);
  for (Index lane = 0; lane < count; lane++) {
    buffer[lane] = from[lane];
  }
  return pload<Packet>(buffer);
}
/** \internal \returns a packet version of \a *from, (un-aligned masked load) /** \internal \returns a packet version of \a *from, (un-aligned masked load)
* There is no generic implementation. We only have implementations for specialized * There is no generic implementation. We only have implementations for specialized
* cases. Generic case should not be called. * cases. Generic case should not be called.
@ -704,14 +736,40 @@ peven_mask(const Packet& /*a*/) {
} }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ /** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
{ (*to) = from; } { (*to) = from; }
/** \internal copy n elements of the packet \a from to \a *to, \a to must be properly aligned
 * offset indicates the starting element in which to store and
 * offset + n <= unpacket_traits::size */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  // Spill the whole packet to an aligned scratch buffer, then copy out only
  // the requested [offset, offset+n) slice.
  EIGEN_ALIGN_MAX Scalar buffer[packet_size];
  pstore<Scalar>(buffer, from);
  const Index count = numext::mini(n, packet_size - offset);
  for (Index lane = 0; lane < count; lane++) {
    to[lane] = buffer[offset + lane];
  }
}
/** \internal copy the packet \a from to \a *to, (un-aligned store) */ /** \internal copy the packet \a from to \a *to, (un-aligned store) */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
{ (*to) = from; } { (*to) = from; }
/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
  // Spill the whole packet to an aligned scratch buffer, then copy out only
  // the first n elements.
  EIGEN_ALIGN_MAX Scalar buffer[packet_size];
  pstore<Scalar>(buffer, from);
  const Index count = numext::mini(n, packet_size);
  for (Index lane = 0; lane < count; lane++) {
    to[lane] = buffer[lane];
  }
}
/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask) /** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
* There is no generic implementation. We only have implementations for specialized * There is no generic implementation. We only have implementations for specialized
* cases. Generic case should not be called. * cases. Generic case should not be called.
@ -724,9 +782,29 @@ pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
{ return ploadu<Packet>(from); } { return ploadu<Packet>(from); }
/** \internal \returns a packet whose first n elements are gathered from \a from with stride \a stride
 * (generic scalar fallback). All elements after the last element gathered are initialized with zero. */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  // Guard against out-of-range requests in debug builds, matching the checks
  // in pload_partial/pstoreu_partial and the specialized gather helpers.
  eigen_assert(n <= packet_size && "number of elements will gather past end of packet");
  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
    elements[i] = from[i*stride];
  }
  return pload<Packet>(elements);
}
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/) template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
{ pstore(to, from); } { pstore(to, from); }
/** \internal scatter the first n elements of the packet \a from to \a to with stride \a stride
 * (generic scalar fallback). Elements past the first n are not written. */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  // Guard against out-of-range requests in debug builds, matching the checks
  // in pstore_partial and the specialized scatter helpers.
  eigen_assert(n <= packet_size && "number of elements will scatter past end of packet");
  EIGEN_ALIGN_MAX Scalar elements[packet_size];
  pstore<Scalar>(elements, from);
  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
    to[i*stride] = elements[i];
  }
}
/** \internal tries to do cache prefetching of \a addr */ /** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{ {
@ -996,6 +1074,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_trai
return ploadu<Packet>(from); return ploadu<Packet>(from);
} }
/** \internal \returns n elements of a packet version of \a *from.
 * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
template<typename Packet, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
{
  // Dispatch on the compile-time alignment guarantee of the source pointer.
  return (Alignment >= unpacket_traits<Packet>::alignment)
           ? pload_partial<Packet>(from, n, offset)
           : ploadu_partial<Packet>(from, n);
}
/** \internal copy the packet \a from to \a *to. /** \internal copy the packet \a from to \a *to.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
template<typename Scalar, typename Packet, int Alignment> template<typename Scalar, typename Packet, int Alignment>
@ -1007,6 +1096,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
pstoreu(to, from); pstoreu(to, from);
} }
/** \internal copy n elements of the packet \a from to \a *to.
 * The pointer \a to must be aligned on a \a Alignment bytes boundary. */
template<typename Scalar, typename Packet, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
{
if(Alignment >= unpacket_traits<Packet>::alignment)
pstore_partial(to, from, n, offset);
else
pstoreu_partial(to, from, n);
}
/** \internal \returns a packet version of \a *from. /** \internal \returns a packet version of \a *from.
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
* hardware if available to speedup the loading of data that won't be modified * hardware if available to speedup the loading of data that won't be modified

View File

@ -132,10 +132,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); } template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
// Partial loads for Packet2cf: each std::complex<float> occupies two float
// lanes, so n and offset are scaled by 2 and the work is deferred to the
// Packet4f implementation.
template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
{
return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
}
template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
{
return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2));
}
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
// Partial stores for Packet2cf: scale n/offset by 2 (real + imaginary lanes)
// and forward to the float implementations.
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); }
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1) EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
{ {
@ -157,19 +167,46 @@ EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std
return Packet2cf(res0); return Packet2cf(res0);
} }
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride) template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from)
{ {
EIGEN_ALIGN16 std::complex<float> af[2]; Packet2cf res;
af[0] = from[0*stride]; res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
af[1] = from[1*stride]; return res;
return pload<Packet2cf>(af);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
{ {
EIGEN_ALIGN16 std::complex<float> af[2]; eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
pstore<std::complex<float> >((std::complex<float> *) af, from); EIGEN_ALIGN16 Scalar af[2];
to[0*stride] = af[0]; for (Index i = 0; i < n; i++) {
to[1*stride] = af[1]; af[i] = from[i*stride];
}
return pload_ignore<Packet>(af);
}
// Full gather for Packet2cf: delegates to the size-2 complex helper, which
// defaults to gathering both elements.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride);
}
// Partial gather for Packet2cf: only the first n complex elements are read.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n)
{
return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n);
}
// Scatter up to n (<= packet size, i.e. 2) complex elements of \a from to
// \a to with stride \a stride. Shared by the full and partial Packet2cf
// scatter implementations; with the default n = 2 it scatters the whole packet.
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
{
eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
// Spill the packet to an aligned scratch array, then write out n elements.
EIGEN_ALIGN16 Scalar af[2];
pstore<Scalar>((Scalar *) af, from);
for (Index i = 0; i < n; i++) {
to[i*stride] = af[i];
}
}
// Full scatter for Packet2cf: stores both complex elements (the helper's n
// parameter defaults to 2).
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride, const Index n)
{
pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n);
} }
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
@ -336,17 +373,35 @@ template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
// Partial loads for Packet1cd: each std::complex<double> occupies two double
// lanes, so n and offset are scaled by 2 and the work is deferred to the
// Packet2d implementation.
template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
{
return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
}
template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n)
{
return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2));
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
// Partial stores for Packet1cd: scale n/offset by 2 (real + imaginary lanes)
// and forward to the double implementations.
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
{ {
return pload<Packet1cd>(from); return pload<Packet1cd>(from);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index)
{
return pload<Packet1cd>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
{
pstore<std::complex<double> >(to, from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index, const Index)
{ {
pstore<std::complex<double> >(to, from); pstore<std::complex<double> >(to, from);
} }

View File

@ -143,6 +143,12 @@ static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32
#define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#endif #endif
// Ask the compiler to unroll the following loop up to 16 iterations; the
// pragma is spelled differently for the clang/LLVM and GCC front ends.
// NOTE(review): GCC documents this pragma as `#pragma GCC unroll n` (no
// parentheses) — confirm the parenthesized form is accepted by the targeted
// GCC versions.
#if EIGEN_COMP_LLVM
#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
#else
#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
#endif
template <> template <>
struct packet_traits<float> : default_packet_traits { struct packet_traits<float> : default_packet_traits {
typedef Packet4f type; typedef Packet4f type;
@ -471,6 +477,118 @@ template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* fr
return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
} }
// Aligned packet load that deliberately does not care whether every lane of
// the source buffer was initialized; used by the partial load/gather paths,
// which only fill the lanes they need. Silences GCC's -Wmaybe-uninitialized
// for the untouched lanes.
template <typename Packet>
EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from)
{
// some versions of GCC throw "unused-but-set-parameter".
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(from);
EIGEN_DEBUG_ALIGNED_LOAD
// Ignore partial input memory initialized
#if !EIGEN_COMP_LLVM
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#ifdef __VSX__
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
return vec_ld(0, from);
#endif
#if !EIGEN_COMP_LLVM
#pragma GCC diagnostic pop
#endif
}
// bfloat16 packets reuse the unsigned-short load (same 16-bit lane layout).
template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from)
{
return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
// Shared implementation of the partial aligned load for all AltiVec packet
// types: load n elements from `from` into lanes [offset, offset+n) of the
// result. Lanes outside that range are not meaningfully initialized.
template <typename Packet>
EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
{
// some versions of GCC throw "unused-but-set-parameter".
// ignoring these warnings for now.
const Index packet_size = unpacket_traits<Packet>::size;
eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_DEBUG_ALIGNED_LOAD
EIGEN_UNUSED_VARIABLE(from);
// Power9 provides a length-controlled load: fetch exactly n * size bytes.
Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
if (offset) {
// Shift the loaded bytes so they land in lanes starting at `offset`.
// The shift amount is offset * size bytes, expressed in bits as required
// by the octet-shift intrinsics; direction depends on endianness.
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
load = Packet(vec_sro(Packet16uc(load), shift));
#else
load = Packet(vec_slo(Packet16uc(load), shift));
#endif
}
return load;
#else
// Pre-Power9 fallback: byte-copy the n * size source bytes into an aligned
// scratch buffer (largest chunks first), then load the full packet from it.
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
Index n2 = n * size;
Index i = 0;
if (16 <= n2) {
// A full 16-byte chunk implies n == packet_size and offset == 0, so the
// unaligned store into the scratch buffer cannot overrun it.
pstoreu(load2, ploadu<Packet16uc>(from2));
i += 16;
}
if (i + 8 <= n2) {
*reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
i += 8;
}
if (i + 4 <= n2) {
*reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
i += 4;
}
if (i + 2 <= n2) {
*reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
i += 2;
}
if (i < n2) {
*reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
}
// pload_ignore suppresses -Wmaybe-uninitialized for the untouched lanes.
return pload_ignore<Packet>(load);
#endif
}
// Typed entry points for the partial aligned load; all defer to
// pload_partial_common.
template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
{
return pload_partial_common<Packet4f>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
{
return pload_partial_common<Packet4i>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
{
return pload_partial_common<Packet8s>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
{
return pload_partial_common<Packet8us>(from, n, offset);
}
// bfloat16 reuses the unsigned-short implementation (same 16-bit lane layout).
template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
{
return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
{
return pload_partial_common<Packet16c>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
{
return pload_partial_common<Packet16uc>(from, n, offset);
}
template <typename Packet> template <typename Packet>
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
// some versions of GCC throw "unused-but-set-parameter" (float *to). // some versions of GCC throw "unused-but-set-parameter" (float *to).
@ -519,6 +637,90 @@ template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* t
pstore_common<Packet16uc>(to, from); pstore_common<Packet16uc>(to, from);
} }
// Shared implementation of the partial aligned store for all AltiVec packet
// types: write lanes [offset, offset+n) of `from` to `to[0..n)`.
template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
{
// some versions of GCC throw "unused-but-set-parameter" (float *to).
// ignoring these warnings for now.
const Index packet_size = unpacket_traits<Packet>::size;
eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(to);
EIGEN_DEBUG_ALIGNED_STORE
Packet store = from;
if (offset) {
// Shift the packet so the lanes starting at `offset` become the leading
// bytes; shift amount is offset * size bytes, expressed in bits as
// required by the octet-shift intrinsics. Direction depends on endianness
// (mirror of the pload_partial_common shift).
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
store = Packet(vec_slo(Packet16uc(store), shift));
#else
store = Packet(vec_sro(Packet16uc(store), shift));
#endif
}
// Power9 length-controlled store: write exactly n * size bytes.
vec_xst_len(store, to, n * size);
#else
// Pre-Power9 fallback: spill the packet to an aligned scratch buffer, then
// byte-copy the requested n * size bytes to the destination, largest chunks
// first.
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
pstore(store, from);
unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
Index n2 = n * size;
Index i = 0;
if (16 <= n2) {
// A full 16-byte chunk implies n == packet_size and offset == 0; `to` is
// required to be properly aligned here, so an aligned store is safe.
pstore(to2, ploadu<Packet16uc>(store2));
i += 16;
}
if (i + 8 <= n2) {
*reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
i += 8;
}
if (i + 4 <= n2) {
*reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
i += 4;
}
if (i + 2 <= n2) {
*reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
i += 2;
}
if (i < n2) {
*reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
}
#endif
}
// Typed entry points for the partial aligned store; all defer to
// pstore_partial_common.
template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
{
pstore_partial_common<Packet4f>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
{
pstore_partial_common<Packet4i>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
{
pstore_partial_common<Packet8s>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
{
pstore_partial_common<Packet8us>(to, from, n, offset);
}
// bfloat16 reuses the unsigned-short implementation (same 16-bit lane layout).
template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
{
pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
{
pstore_partial_common<Packet16c>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
{
pstore_partial_common<Packet16uc>(to, from, n, offset);
}
template<typename Packet> template<typename Packet>
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
{ {
@ -596,168 +798,167 @@ pbroadcast4<Packet4i>(const int *a,
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3); pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
} }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
{ {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
a[0] = from[0*stride]; eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
a[1] = from[1*stride]; LOAD_STORE_UNROLL_16
a[2] = from[2*stride]; for (Index i = 0; i < n; i++) {
a[3] = from[3*stride]; a[i] = from[i*stride];
return pload<Packet>(a); }
// Leave rest of the array uninitialized
return pload_ignore<Packet>(a);
} }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{ {
return pgather_common<Packet4f>(from, stride); return pgather_common<Packet4f>(from, stride);
} }
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{ {
return pgather_common<Packet4i>(from, stride); return pgather_common<Packet4i>(from, stride);
} }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
{ {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; return pgather_common<Packet8s>(from, stride);
a[0] = from[0*stride];
a[1] = from[1*stride];
a[2] = from[2*stride];
a[3] = from[3*stride];
a[4] = from[4*stride];
a[5] = from[5*stride];
a[6] = from[6*stride];
a[7] = from[7*stride];
return pload<Packet>(a);
} }
template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
{ {
return pgather_size8<Packet8s>(from, stride); return pgather_common<Packet8us>(from, stride);
} }
template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
{ {
return pgather_size8<Packet8us>(from, stride); return pgather_common<Packet8bf>(from, stride);
} }
template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
{ {
return pgather_size8<Packet8bf>(from, stride); return pgather_common<Packet16c>(from, stride);
} }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
{ {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; return pgather_common<Packet16uc>(from, stride);
a[0] = from[0*stride];
a[1] = from[1*stride];
a[2] = from[2*stride];
a[3] = from[3*stride];
a[4] = from[4*stride];
a[5] = from[5*stride];
a[6] = from[6*stride];
a[7] = from[7*stride];
a[8] = from[8*stride];
a[9] = from[9*stride];
a[10] = from[10*stride];
a[11] = from[11*stride];
a[12] = from[12*stride];
a[13] = from[13*stride];
a[14] = from[14*stride];
a[15] = from[15*stride];
return pload<Packet>(a);
} }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride, const Index n)
template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
{ {
return pgather_size16<Packet16c>(from, stride); return pgather_common<Packet4f>(from, stride, n);
} }
template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride, const Index n)
{ {
return pgather_size16<Packet16uc>(from, stride); return pgather_common<Packet4i>(from, stride, n);
} }
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride, const Index n)
{ {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; return pgather_common<Packet8s>(from, stride, n);
}
// Partial strided gathers: read only the first n strided elements into a
// packet. Each specialization forwards to the generic pgather_common helper
// with an explicit element count n (n must not exceed the packet size).
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n)
{
  return pgather_common<Packet8us>(from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride, const Index n)
{
  return pgather_common<Packet8bf>(from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from, Index stride, const Index n)
{
  return pgather_common<Packet16c>(from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from, Index stride, const Index n)
{
  return pgather_common<Packet16uc>(from, stride, n);
}
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
{
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
pstore<__UNPACK_TYPE__(Packet)>(a, from); pstore<__UNPACK_TYPE__(Packet)>(a, from);
to[0*stride] = a[0]; LOAD_STORE_UNROLL_16
to[1*stride] = a[1]; for (Index i = 0; i < n; i++) {
to[2*stride] = a[2]; to[i*stride] = a[i];
to[3*stride] = a[3]; }
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{ {
pscatter_size4<Packet4f>(to, from, stride); pscatter_common<Packet4f>(to, from, stride);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{ {
pscatter_size4<Packet4i>(to, from, stride); pscatter_common<Packet4i>(to, from, stride);
} }
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
{ {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; pscatter_common<Packet8s>(to, from, stride);
pstore<__UNPACK_TYPE__(Packet)>(a, from);
to[0*stride] = a[0];
to[1*stride] = a[1];
to[2*stride] = a[2];
to[3*stride] = a[3];
to[4*stride] = a[4];
to[5*stride] = a[5];
to[6*stride] = a[6];
to[7*stride] = a[7];
} }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
{ {
pscatter_size8<Packet8s>(to, from, stride); pscatter_common<Packet8us>(to, from, stride);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
{ {
pscatter_size8<Packet8us>(to, from, stride); pscatter_common<Packet8bf>(to, from, stride);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
{ {
pscatter_size8<Packet8bf>(to, from, stride); pscatter_common<Packet16c>(to, from, stride);
} }
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
{ {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; pscatter_common<Packet16uc>(to, from, stride);
pstore<__UNPACK_TYPE__(Packet)>(a, from);
to[0*stride] = a[0];
to[1*stride] = a[1];
to[2*stride] = a[2];
to[3*stride] = a[3];
to[4*stride] = a[4];
to[5*stride] = a[5];
to[6*stride] = a[6];
to[7*stride] = a[7];
to[8*stride] = a[8];
to[9*stride] = a[9];
to[10*stride] = a[10];
to[11*stride] = a[11];
to[12*stride] = a[12];
to[13*stride] = a[13];
to[14*stride] = a[14];
to[15*stride] = a[15];
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
{ {
pscatter_size16<Packet16c>(to, from, stride); pscatter_common<Packet4f>(to, from, stride, n);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
{ {
pscatter_size16<Packet16uc>(to, from, stride); pscatter_common<Packet4i>(to, from, stride, n);
}
// Partial strided scatters: write only the first n packet elements to a
// strided destination. Each specialization forwards to pscatter_common,
// which asserts n <= unpacket_traits<Packet>::size.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
{
  pscatter_common<Packet8s>(to, from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
{
  pscatter_common<Packet8us>(to, from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride, const Index n)
{
  pscatter_common<Packet8bf>(to, from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
{
  pscatter_common<Packet16c>(to, from, stride, n);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
{
pscatter_common<Packet16uc>(to, from, stride, n);
} }
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
@ -1008,6 +1209,73 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char
return ploadu_common<Packet16uc>(from); return ploadu_common<Packet16uc>(from);
} }
/** \internal Unaligned load of the first n elements of a packet.
 * On POWER9 this maps to the length-controlled load vec_xl_len; otherwise the
 * first n*sizeof(Scalar) bytes are copied from \a from into an aligned scratch
 * buffer in decreasing power-of-two chunks (16/8/4/2/1 bytes) and returned via
 * pload_ignore. Requires n <= unpacket_traits<Packet>::size (asserted). */
template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  // NOTE(review): both the aligned and unaligned load counters are bumped for
  // this single access -- confirm whether EIGEN_DEBUG_ALIGNED_LOAD is intended.
  EIGEN_DEBUG_ALIGNED_LOAD
  EIGEN_DEBUG_UNALIGNED_LOAD
  return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
#else
  // NOTE(review): lanes past the first n are left uninitialized in the scratch
  // buffer; callers must not rely on their contents -- confirm this is intended
  // (pload_partial, by contrast, zero-fills unused lanes).
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
  unsigned char* load2 = reinterpret_cast<unsigned char *>(load);
  unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
  Index n2 = n * size; // total number of bytes to transfer
  Index i = 0;
  // Copy in the largest chunks possible: one full 16-byte vector first ...
  if (16 <= n2) {
    pstore(load2, ploadu<Packet16uc>(from2));
    i += 16;
  }
  // ... then the remainder in 8-, 4-, 2- and finally 1-byte pieces.
  if (i + 8 <= n2) {
    *reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
    i += 8;
  }
  if (i + 4 <= n2) {
    *reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
    i += 4;
  }
  if (i + 2 <= n2) {
    *reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
    i += 2;
  }
  if (i < n2) {
    *reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
  }
  return pload_ignore<Packet>(load);
#endif
}
// Partial unaligned loads: thin per-type forwards to ploadu_partial_common.
template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n)
{
  return ploadu_partial_common<Packet4f>(from, n);
}
template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n)
{
  return ploadu_partial_common<Packet4i>(from, n);
}
template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n)
{
  return ploadu_partial_common<Packet8s>(from, n);
}
template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n)
{
  return ploadu_partial_common<Packet8us>(from, n);
}
// bfloat16 is loaded via its raw 16-bit payload, reusing the Packet8us path.
template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n)
{
  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n);
}
template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n)
{
  return ploadu_partial_common<Packet16c>(from, n);
}
template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n)
{
  return ploadu_partial_common<Packet16uc>(from, n);
}
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
{ {
Packet p; Packet p;
@ -1128,6 +1396,77 @@ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* t
pstoreu_common<Packet16uc>(to, from); pstoreu_common<Packet16uc>(to, from);
} }
/** \internal Unaligned store of the first n elements of a packet.
 * On POWER9 this maps to the length-controlled store vec_xst_len; otherwise
 * the packet is spilled to an aligned scratch buffer and its first
 * n*sizeof(Scalar) bytes are copied to \a to in decreasing power-of-two
 * chunks (16/8/4/2/1 bytes), mirroring ploadu_partial_common.
 * Requires n <= unpacket_traits<Packet>::size (asserted).
 *
 * Fix: the original #else branch stored the data twice -- once via the
 * chunked byte copies and again via a trailing per-element loop
 * (to[i] = from[i]) whose index also shadowed the outer `i`. The redundant
 * second pass is removed; the chunked copy already writes all n elements. */
template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_DEBUG_UNALIGNED_STORE
  vec_xst_len(from, to, n * size);
#else
  // Spill the packet to an aligned scratch buffer, then copy the first
  // n*size bytes to the (possibly unaligned) destination.
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
  pstore(store, from);
  unsigned char* store2 = reinterpret_cast<unsigned char *>(store);
  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
  Index n2 = n * size; // total number of bytes to transfer
  Index i = 0;
  // Copy in the largest chunks possible: one full 16-byte vector first ...
  if (16 <= n2) {
    pstoreu(to2, pload<Packet16uc>(store2));
    i += 16;
  }
  // ... then the remainder in 8-, 4-, 2- and finally 1-byte pieces.
  if (i + 8 <= n2) {
    *reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
    i += 8;
  }
  if (i + 4 <= n2) {
    *reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
    i += 4;
  }
  if (i + 2 <= n2) {
    *reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
    i += 2;
  }
  if (i < n2) {
    *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
  }
#endif
}
// Partial unaligned stores: thin per-type forwards to pstoreu_partial_common.
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n)
{
  pstoreu_partial_common<Packet4f>(to, from, n);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n)
{
  pstoreu_partial_common<Packet4i>(to, from, n);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n)
{
  pstoreu_partial_common<Packet8s>(to, from, n);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n)
{
  pstoreu_partial_common<Packet8us>(to, from, n);
}
// bfloat16 is stored via its raw 16-bit payload, reusing the Packet8us path.
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n)
{
  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n)
{
  pstoreu_partial_common<Packet16c>(to, from, n);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n)
{
  pstoreu_partial_common<Packet16uc>(to, from, n);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
@ -2387,12 +2726,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
} }
// Partial aligned load for double: forward to the generic helper.
template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
{
  return pload_partial_common<Packet2d>(from, n, offset);
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{ {
EIGEN_DEBUG_ALIGNED_STORE EIGEN_DEBUG_ALIGNED_STORE
vec_xst(from, 0, to); vec_xst(from, 0, to);
} }
// Partial aligned store for double: forward to the generic helper.
template<> EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
{
  pstore_partial_common<Packet2d>(to, from, n, offset);
}
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
Packet2d v = {from, from}; Packet2d v = {from, from};
return v; return v;
@ -2414,19 +2763,21 @@ pbroadcast4<Packet2d>(const double *a,
a3 = pset1<Packet2d>(a[3]); a3 = pset1<Packet2d>(a[3]);
} }
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{ {
EIGEN_ALIGN16 double af[2]; return pgather_common<Packet2d>(from, stride);
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2d>(af);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
{ {
EIGEN_ALIGN16 double af[2]; return pgather_common<Packet2d>(from, stride, n);
pstore<double>(af, from); }
to[0*stride] = af[0]; template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
to[1*stride] = af[1]; {
pscatter_common<Packet2d>(to, from, stride);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
{
pscatter_common<Packet2d>(to, from, stride, n);
} }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
@ -2517,6 +2868,11 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
return vec_xl(0, const_cast<double*>(from)); return vec_xl(0, const_cast<double*>(from));
} }
// Partial unaligned load for double: forward to the generic helper.
template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n)
{
  return ploadu_partial_common<Packet2d>(from, n);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ {
Packet2d p; Packet2d p;
@ -2531,6 +2887,11 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
vec_xst(from, 0, to); vec_xst(from, 0, to);
} }
// Partial unaligned store for double: forward to the generic helper.
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n)
{
  pstoreu_partial_common<Packet2d>(to, from, n);
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }

View File

@ -100,6 +100,11 @@ public:
return ploadt<PacketType, AlignmentType>(m_data + i); return ploadt<PacketType, AlignmentType>(m_data + i);
} }
// Load only n elements of a packet starting at m_data[i]; `offset` is the
// first packet lane to fill (the partial-load contract zero-fills the rest).
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const {
  return ploadt_partial<PacketType, AlignmentType>(m_data + i, n, offset);
}
template<typename PacketType, int AlignmentT> template<typename PacketType, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const {
return ploadt<PacketType, AlignmentT>(m_data + i); return ploadt<PacketType, AlignmentT>(m_data + i);
@ -110,6 +115,11 @@ public:
pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p); pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
} }
// Store only the n packet lanes starting at lane `offset` to m_data[i].
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index offset = 0) const {
  pstoret_partial<Scalar, PacketType, AlignmentType>(m_data + i, p, n, offset);
}
protected: protected:
Scalar *m_data; Scalar *m_data;
}; };
@ -208,6 +218,11 @@ public:
return ploadt<PacketType, AlignmentType>(&operator()(i, j)); return ploadt<PacketType, AlignmentType>(&operator()(i, j));
} }
// 2D variant: load only n elements of a packet starting at (i, j);
// `offset` is the first packet lane to fill.
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index offset = 0) const {
  return ploadt_partial<PacketType, AlignmentType>(&operator()(i, j), n, offset);
}
template <typename PacketT, int AlignmentT> template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return ploadt<PacketT, AlignmentT>(&operator()(i, j)); return ploadt<PacketT, AlignmentT>(&operator()(i, j));
@ -218,6 +233,11 @@ public:
pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p); pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
} }
// 2D variant: store only the n packet lanes starting at lane `offset` to (i, j).
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index offset = 0) const {
  pstoret_partial<Scalar, PacketType, AlignmentType>(&operator()(i, j), p, n, offset);
}
template<typename SubPacket> template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride); pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@ -271,11 +291,21 @@ public:
return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value()); return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
} }
// Non-unit-increment variant: gather n elements with stride m_incr.
// The lane offset is not supported for strided gathers and is ignored.
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const {
  return pgather_partial<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value(), n);
}
template<typename PacketType> template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value()); pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
} }
// Non-unit-increment variant: scatter the first n packet elements with stride
// m_incr. The lane offset is not supported for strided scatters and is ignored.
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/) const {
  pscatter_partial<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value(), n);
}
protected: protected:
Scalar *m_data; Scalar *m_data;
const internal::variable_if_dynamic<Index,Incr> m_incr; const internal::variable_if_dynamic<Index,Incr> m_incr;
@ -312,6 +342,11 @@ public:
return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value()); return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
} }
// 2D non-unit-increment variant: gather n elements starting at (i, j) with
// stride m_incr; the lane offset is ignored for strided gathers.
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/) const {
  return pgather_partial<Scalar,PacketType>(&operator()(i, j),m_incr.value(),n);
}
template <typename PacketT, int AlignmentT> template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value()); return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
@ -322,6 +357,11 @@ public:
pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value()); pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
} }
// 2D non-unit-increment variant: scatter the first n packet elements starting
// at (i, j) with stride m_incr; the lane offset is ignored for strided scatters.
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/) const {
  pscatter_partial<Scalar, PacketType>(&operator()(i, j), p, m_incr.value(), n);
}
template<typename SubPacket> template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride); pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);

View File

@ -458,6 +458,36 @@ void packetmath() {
VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu"); VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
} }
// Exercise the partial (first-N-elements) packet load/store primitives for
// every element count N in [0, PacketSize] and every lane offset M.
for (int M = 0; M < PacketSize; ++M) {
  for (int N = 0; N <= PacketSize; ++N) {
    // Fresh random data each round so stale results cannot mask a failure.
    for (int j = 0; j < size; ++j) {
      data1[j] = internal::random<Scalar>() / RealScalar(PacketSize);
      data2[j] = internal::random<Scalar>() / RealScalar(PacketSize);
      refvalue = (std::max)(refvalue, numext::abs(data1[j]));
    }
    if (M == 0) {
      // Offset-free round trips: aligned, then unaligned-load, then unaligned-store.
      internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N), N);
      VERIFY(test::areApprox(data1, data2, N) && "aligned loadN/storeN");
      for (int offset = 0; offset < PacketSize; ++offset) {
        internal::pstore_partial(data2, internal::ploadu_partial<Packet>(data1 + offset, N), N);
        VERIFY(test::areApprox(data1 + offset, data2, N) && "internal::ploadu_partial");
      }
      for (int offset = 0; offset < PacketSize; ++offset) {
        internal::pstoreu_partial(data2 + offset, internal::pload_partial<Packet>(data1, N), N);
        VERIFY(test::areApprox(data1, data2 + offset, N) && "internal::pstoreu_partial");
      }
    }
    if (N + M > PacketSize) continue; // Don't read or write past end of Packet
    // Round trip with a lane offset M: N elements occupy lanes [M, M+N).
    internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N, M), N, M);
    VERIFY(test::areApprox(data1, data2, N) && "aligned offset loadN/storeN");
  }
}
if (internal::unpacket_traits<Packet>::masked_load_available) { if (internal::unpacket_traits<Packet>::masked_load_available) {
test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h; test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
unsigned long long max_umask = (0x1ull << PacketSize); unsigned long long max_umask = (0x1ull << PacketSize);
@ -1372,6 +1402,36 @@ void packetmath_scatter_gather() {
for (int i = 0; i < PacketSize; ++i) { for (int i = 0; i < PacketSize; ++i) {
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather"); VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
} }
// Exercise pscatter_partial / pgather_partial for every element count N.
for (Index N = 0; N <= PacketSize; ++N) {
  for (Index i = 0; i < N; ++i) {
    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
  }
  // Zero the destination so slots the scatter must not touch can be checked.
  for (Index i = 0; i < N * 20; ++i) {
    buffer[i] = Scalar(0);
  }
  packet = internal::pload_partial<Packet>(data1, N);
  internal::pscatter_partial<Scalar, Packet>(buffer, packet, stride, N);
  // Only the first N stride-spaced slots may have been written; everything
  // else must still be zero.
  for (Index i = 0; i < N * 20; ++i) {
    if ((i % stride) == 0 && i < stride * N) {
      VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter_partial");
    } else {
      VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter_partial");
    }
  }
  // Gather N elements back with a fixed stride of 7 and compare element-wise.
  for (Index i = 0; i < N * 7; ++i) {
    buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
  }
  packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
  internal::pstore_partial(data1, packet, N);
  for (Index i = 0; i < N; ++i) {
    VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather_partial");
  }
}
} }
namespace Eigen { namespace Eigen {