Add optional offset parameter to ploadu_partial and pstoreu_partial

This commit is contained in:
Chip Kerchner 2023-06-23 19:53:05 +00:00 committed by Charles Schlosser
parent 44c20bbbe3
commit 211c5dfc67
3 changed files with 85 additions and 65 deletions

View File

@ -730,14 +730,14 @@ ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns n elements of a packet version of \a *from, (un-aligned load) /** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
* All elements after the last element loaded will be initialized with zero */ * All elements after the last element loaded will be initialized with zero */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n) ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
{ {
const Index packet_size = unpacket_traits<Packet>::size; const Index packet_size = unpacket_traits<Packet>::size;
eigen_assert(n <= packet_size && "number of elements will read past end of packet"); eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
typedef typename unpacket_traits<Packet>::type Scalar; typedef typename unpacket_traits<Packet>::type Scalar;
EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) }; EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
for (Index i = 0; i < numext::mini(n,packet_size); i++) { for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
elements[i] = from[i]; elements[i] = from[i-offset];
} }
return pload<Packet>(elements); return pload<Packet>(elements);
} }
@ -855,14 +855,14 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
{ (*to) = from; } { (*to) = from; }
/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */ /** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n) template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
{ {
const Index packet_size = unpacket_traits<Packet>::size; const Index packet_size = unpacket_traits<Packet>::size;
eigen_assert(n <= packet_size && "number of elements will write past end of packet"); eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
EIGEN_ALIGN_MAX Scalar elements[packet_size]; EIGEN_ALIGN_MAX Scalar elements[packet_size];
pstore<Scalar>(elements, from); pstore<Scalar>(elements, from);
for (Index i = 0; i < numext::mini(n,packet_size); i++) { for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
to[i] = elements[i]; to[i] = elements[i + offset];
} }
} }
@ -1201,7 +1201,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpac
if(Alignment >= unpacket_traits<Packet>::alignment) if(Alignment >= unpacket_traits<Packet>::alignment)
return pload_partial<Packet>(from, n, offset); return pload_partial<Packet>(from, n, offset);
else else
return ploadu_partial<Packet>(from, n); return ploadu_partial<Packet>(from, n, offset);
} }
/** \internal copy the packet \a from to \a *to. /** \internal copy the packet \a from to \a *to.
@ -1223,7 +1223,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Pac
if(Alignment >= unpacket_traits<Packet>::alignment) if(Alignment >= unpacket_traits<Packet>::alignment)
pstore_partial(to, from, n, offset); pstore_partial(to, from, n, offset);
else else
pstoreu_partial(to, from, n); pstoreu_partial(to, from, n, offset);
} }
/** \internal \returns a packet version of \a *from. /** \internal \returns a packet version of \a *from.

View File

@ -136,16 +136,16 @@ template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::com
{ {
return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2)); return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
} }
template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
{ {
return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2)); return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2, offset * 2));
} }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); } template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); } template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstoreu_partial((float*)to, from.v, n * 2, offset * 2); }
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1) EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
{ {
@ -382,14 +382,14 @@ template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::com
{ {
return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2)); return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
} }
template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
{ {
return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2)); return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2, offset * 2));
} }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); } template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); } template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstoreu_partial((double*)to, from.v, n * 2, offset * 2); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

View File

@ -1260,26 +1260,36 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char
return ploadu_common<Packet16uc>(from); return ploadu_common<Packet16uc>(from);
} }
template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n) template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
{ {
const Index packet_size = unpacket_traits<Packet>::size; const Index packet_size = unpacket_traits<Packet>::size;
eigen_internal_assert(n <= packet_size && "number of elements will read past end of packet"); eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet)); const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9 #ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size); EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_ALIGNED_LOAD EIGEN_DEBUG_ALIGNED_LOAD
EIGEN_DEBUG_UNALIGNED_LOAD EIGEN_DEBUG_UNALIGNED_LOAD
return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size); Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
if (offset) {
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
load = Packet(vec_sro(Packet16uc(load), shift));
#else
load = Packet(vec_slo(Packet16uc(load), shift));
#endif
}
return load;
#else #else
if (n) { if (n) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
Index n2 = n * size; Index n2 = n * size;
if (16 <= n2) { if (16 <= n2) {
return ploadu<Packet>(from); pstoreu(load2, ploadu<Packet16uc>(from2));
} else {
memcpy((void *)load2, (void *)from2, n2);
} }
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
unsigned char* load2 = reinterpret_cast<unsigned char *>(load);
unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
memcpy((void *)load2, (void *)from2, n2);
return pload_ignore<Packet>(load); return pload_ignore<Packet>(load);
} else { } else {
return Packet(pset1<Packet16uc>(0)); return Packet(pset1<Packet16uc>(0));
@ -1287,33 +1297,33 @@ template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const
#endif #endif
} }
template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet4f>(from, n); return ploadu_partial_common<Packet4f>(from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet4i>(from, n); return ploadu_partial_common<Packet4i>(from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet8s>(from, n); return ploadu_partial_common<Packet8s>(from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet8us>(from, n); return ploadu_partial_common<Packet8us>(from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n); return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
} }
template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet16c>(from, n); return ploadu_partial_common<Packet16c>(from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet16uc>(from, n); return ploadu_partial_common<Packet16uc>(from, n, offset);
} }
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
@ -1436,57 +1446,67 @@ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* t
pstoreu_common<Packet16uc>(to, from); pstoreu_common<Packet16uc>(to, from);
} }
template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n) template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
{ {
const Index packet_size = unpacket_traits<Packet>::size; const Index packet_size = unpacket_traits<Packet>::size;
eigen_internal_assert(n <= packet_size && "number of elements will write past end of packet"); eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet)); const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9 #ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size); EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_UNALIGNED_STORE EIGEN_DEBUG_UNALIGNED_STORE
vec_xst_len(from, to, n * size); Packet store = from;
if (offset) {
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
store = Packet(vec_slo(Packet16uc(store), shift));
#else
store = Packet(vec_sro(Packet16uc(store), shift));
#endif
}
vec_xst_len(store, to, n * size);
#else #else
if (n) { if (n) {
Index n2 = n * size;
if (16 <= n2) {
pstoreu(to, from);
}
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size]; EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
pstore(store, from); pstore(store, from);
unsigned char* store2 = reinterpret_cast<unsigned char *>(store); unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
unsigned char* to2 = reinterpret_cast<unsigned char *>(to); unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
memcpy((void *)to2, (void *)store2, n2); Index n2 = n * size;
if (16 <= n2) {
pstoreu(to2, ploadu<Packet16uc>(store2));
} else {
memcpy((void *)to2, (void *)store2, n2);
}
} }
#endif #endif
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet4f>(to, from, n); pstoreu_partial_common<Packet4f>(to, from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet4i>(to, from, n); pstoreu_partial_common<Packet4i>(to, from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet8s>(to, from, n); pstoreu_partial_common<Packet8s>(to, from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet8us>(to, from, n); pstoreu_partial_common<Packet8us>(to, from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n); pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet16c>(to, from, n); pstoreu_partial_common<Packet16c>(to, from, n, offset);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet16uc>(to, from, n); pstoreu_partial_common<Packet16uc>(to, from, n, offset);
} }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
@ -2953,9 +2973,9 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
return vec_xl(0, const_cast<double*>(from)); return vec_xl(0, const_cast<double*>(from));
} }
template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n) template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset)
{ {
return ploadu_partial_common<Packet2d>(from, n); return ploadu_partial_common<Packet2d>(from, n, offset);
} }
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
@ -2972,9 +2992,9 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
vec_xst(from, 0, to); vec_xst(from, 0, to);
} }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n) template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
{ {
pstoreu_partial_common<Packet2d>(to, from, n); pstoreu_partial_common<Packet2d>(to, from, n, offset);
} }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }