Add optional offset parameter to ploadu_partial and pstoreu_partial

This commit is contained in:
Chip Kerchner 2023-06-23 19:53:05 +00:00 committed by Charles Schlosser
parent 44c20bbbe3
commit 211c5dfc67
3 changed files with 85 additions and 65 deletions

View File

@ -730,14 +730,14 @@ ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet holding \a n elements read from \a *from (un-aligned load),
* placed at packet positions [\a offset, \a offset + \a n).
* Packet elements that are not loaded are initialized with zero.
* \a n + \a offset must not exceed the packet size (checked with eigen_assert). */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
{
const Index packet_size = unpacket_traits<Packet>::size;
eigen_assert(n <= packet_size && "number of elements will read past end of packet");
eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
typedef typename unpacket_traits<Packet>::type Scalar;
// Aligned scratch buffer, zero-filled so untouched positions read back as zero.
EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
// Copy from[0..n) into the scratch buffer at the requested position;
// mini() clamps the upper bound to the packet size.
for (Index i = 0; i < numext::mini(n,packet_size); i++) {
elements[i] = from[i];
for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
elements[i] = from[i-offset];
}
// Build the packet from the aligned scratch buffer.
return pload<Packet>(elements);
}
@ -855,14 +855,14 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
{ (*to) = from; }
/** \internal copy \a n elements of the packet \a from, starting at packet position \a offset,
* to \a *to (un-aligned store).
* \a n + \a offset must not exceed the packet size (checked with eigen_assert). */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
{
const Index packet_size = unpacket_traits<Packet>::size;
eigen_assert(n <= packet_size && "number of elements will write past end of packet");
eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
// Spill the whole packet into an aligned scratch buffer, then copy out only the requested elements.
EIGEN_ALIGN_MAX Scalar elements[packet_size];
pstore<Scalar>(elements, from);
// to[i] receives scratch element i (+ offset); mini() keeps the read inside the scratch buffer.
for (Index i = 0; i < numext::mini(n,packet_size); i++) {
to[i] = elements[i];
for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
to[i] = elements[i + offset];
}
}
@ -1201,7 +1201,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpac
if(Alignment >= unpacket_traits<Packet>::alignment)
return pload_partial<Packet>(from, n, offset);
else
return ploadu_partial<Packet>(from, n);
return ploadu_partial<Packet>(from, n, offset);
}
/** \internal copy the packet \a from to \a *to.
@ -1223,7 +1223,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Pac
if(Alignment >= unpacket_traits<Packet>::alignment)
pstore_partial(to, from, n, offset);
else
pstoreu_partial(to, from, n);
pstoreu_partial(to, from, n, offset);
}
/** \internal \returns a packet version of \a *from.

View File

@ -136,16 +136,16 @@ template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::com
{
return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
}
// Partial un-aligned load of std::complex<float> values.
// The data is reinterpreted as plain floats and loaded through the Packet4f version,
// so both the element count and the offset are doubled (two floats per complex value).
template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
{
return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2));
return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2, offset * 2));
}
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
// Partial un-aligned store of std::complex<float> values: forwards to the float version on the
// underlying vector from.v, doubling the count and the offset (two floats per complex value).
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstoreu_partial((float*)to, from.v, n * 2, offset * 2); }
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
{
@ -382,14 +382,14 @@ template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::com
{
return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
}
// Partial un-aligned load of std::complex<double> values.
// The data is reinterpreted as plain doubles and loaded through the Packet2d version,
// so both the element count and the offset are doubled (two doubles per complex value).
template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
{
return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2));
return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2, offset * 2));
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
// Partial un-aligned store of std::complex<double> values: forwards to the double version on the
// underlying vector from.v, doubling the count and the offset (two doubles per complex value).
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); }
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstoreu_partial((double*)to, from.v, n * 2, offset * 2); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

View File

@ -1260,26 +1260,36 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char
return ploadu_common<Packet16uc>(from);
}
// Shared implementation used by the ploadu_partial specializations: un-aligned load of n elements
// from `from` into packet lanes [offset, offset + n).
// When _ARCH_PWR9 is defined the data is fetched with vec_xl_len and then shifted into place;
// otherwise it is staged through an aligned stack buffer.
template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
{
const Index packet_size = unpacket_traits<Packet>::size;
eigen_internal_assert(n <= packet_size && "number of elements will read past end of packet");
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
// Bytes per element; n * size is the number of bytes to transfer.
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_ALIGNED_LOAD
EIGEN_DEBUG_UNALIGNED_LOAD
return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
// vec_xl_len is given the byte length n * size rather than a full vector.
Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
if (offset) {
// Shift the loaded bytes by `offset` elements. The amount is offset * size bytes written as a
// bit count (x 8), presumably the form vec_slo/vec_sro expect - TODO confirm against the intrinsic docs.
// The shift direction is chosen by endianness (vec_sro on big-endian, vec_slo otherwise).
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
load = Packet(vec_sro(Packet16uc(load), shift));
#else
load = Packet(vec_slo(Packet16uc(load), shift));
#endif
}
return load;
#else
if (n) {
// Stage the data in an aligned scratch array; the write position is advanced by `offset` elements.
// NOTE(review): `load` has no initializer, so lanes outside [offset, offset + n) look unspecified
// on this path - confirm callers do not rely on zero fill here.
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
Index n2 = n * size;
// A request of at least 16 bytes is moved with one vector load/store instead of memcpy.
if (16 <= n2) {
return ploadu<Packet>(from);
pstoreu(load2, ploadu<Packet16uc>(from2));
} else {
memcpy((void *)load2, (void *)from2, n2);
}
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
unsigned char* load2 = reinterpret_cast<unsigned char *>(load);
unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
memcpy((void *)load2, (void *)from2, n2);
return pload_ignore<Packet>(load);
} else {
// Nothing to load: return an all-zero packet.
return Packet(pset1<Packet16uc>(0));
@ -1287,33 +1297,33 @@ template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const
#endif
}
// ploadu_partial specializations for the 16-byte vector packet types.
// Each one simply forwards its arguments to ploadu_partial_common for the matching packet type.
template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet4f>(from, n);
return ploadu_partial_common<Packet4f>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet4i>(from, n);
return ploadu_partial_common<Packet4i>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet8s>(from, n);
return ploadu_partial_common<Packet8s>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet8us>(from, n);
return ploadu_partial_common<Packet8us>(from, n, offset);
}
// bfloat16 data is reinterpreted as unsigned short and routed through the Packet8us instantiation.
template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n);
return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet16c>(from, n);
return ploadu_partial_common<Packet16c>(from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet16uc>(from, n);
return ploadu_partial_common<Packet16uc>(from, n, offset);
}
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
@ -1436,57 +1446,67 @@ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* t
pstoreu_common<Packet16uc>(to, from);
}
// Shared implementation used by the pstoreu_partial specializations: un-aligned store of n elements
// of `from`, starting at packet lane `offset`, to `to`.
// When _ARCH_PWR9 is defined the packet is shifted and written with vec_xst_len;
// otherwise it is spilled to an aligned stack buffer and copied out from there.
template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n)
template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
{
const Index packet_size = unpacket_traits<Packet>::size;
eigen_internal_assert(n <= packet_size && "number of elements will write past end of packet");
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
// Bytes per element; n * size is the number of bytes to transfer.
const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
EIGEN_UNUSED_VARIABLE(packet_size);
EIGEN_DEBUG_UNALIGNED_STORE
vec_xst_len(from, to, n * size);
Packet store = from;
if (offset) {
// Shift the packet by `offset` elements before storing. The amount is offset * size bytes written
// as a bit count (x 8), presumably the form vec_slo/vec_sro expect - TODO confirm against the intrinsic docs.
// The direction is the opposite of the one used by the partial load (vec_slo on big-endian, vec_sro otherwise).
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
store = Packet(vec_slo(Packet16uc(store), shift));
#else
store = Packet(vec_sro(Packet16uc(store), shift));
#endif
}
// vec_xst_len is given the byte length n * size rather than a full vector.
vec_xst_len(store, to, n * size);
#else
if (n) {
Index n2 = n * size;
if (16 <= n2) {
pstoreu(to, from);
}
// Spill the packet to an aligned scratch array; the read position is advanced by `offset` elements.
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
pstore(store, from);
unsigned char* store2 = reinterpret_cast<unsigned char *>(store);
unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
memcpy((void *)to2, (void *)store2, n2);
Index n2 = n * size;
// A request of at least 16 bytes is moved with one vector load/store instead of memcpy.
if (16 <= n2) {
pstoreu(to2, ploadu<Packet16uc>(store2));
} else {
memcpy((void *)to2, (void *)store2, n2);
}
}
#endif
}
// pstoreu_partial specializations for the 16-byte vector packet types.
// Each one simply forwards its arguments to pstoreu_partial_common for the matching packet type.
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet4f>(to, from, n);
pstoreu_partial_common<Packet4f>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet4i>(to, from, n);
pstoreu_partial_common<Packet4i>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet8s>(to, from, n);
pstoreu_partial_common<Packet8s>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet8us>(to, from, n);
pstoreu_partial_common<Packet8us>(to, from, n, offset);
}
// bfloat16 storage is reinterpreted as unsigned short and routed through the Packet8us instantiation.
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n);
pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet16c>(to, from, n);
pstoreu_partial_common<Packet16c>(to, from, n, offset);
}
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet16uc>(to, from, n);
pstoreu_partial_common<Packet16uc>(to, from, n, offset);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
@ -2953,9 +2973,9 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
return vec_xl(0, const_cast<double*>(from));
}
// Partial un-aligned load for Packet2d: forwards its arguments to ploadu_partial_common.
template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n)
template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset)
{
return ploadu_partial_common<Packet2d>(from, n);
return ploadu_partial_common<Packet2d>(from, n, offset);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
@ -2972,9 +2992,9 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
vec_xst(from, 0, to);
}
// Partial un-aligned store for Packet2d: forwards its arguments to pstoreu_partial_common.
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n)
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
{
pstoreu_partial_common<Packet2d>(to, from, n);
pstoreu_partial_common<Packet2d>(to, from, n, offset);
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }