mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-21 09:09:36 +08:00
Add pload_partial, pstore_partial (and unaligned versions), pgather_partial, pscatter_partial, loadPacketPartial and storePacketPartial.
This commit is contained in:
parent
c603275dc9
commit
84cf3ff18d
@ -606,14 +606,46 @@ pldexp(const Packet &a, const Packet &exponent) {
|
|||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pabsdiff(const Packet& a, const Packet& b)
{
  // Lane-wise selection: where a < b take (b - a), elsewhere take (a - b).
  const Packet a_below_b = pcmp_lt(a, b);
  const Packet b_minus_a = psub(b, a);
  const Packet a_minus_b = psub(a, b);
  return pselect(a_below_b, b_minus_a, a_minus_b);
}
|
||||||
|
|
||||||
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
|
/** \internal \returns a packet version of \a *from, from must be properly aligned */
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||||
|
|
||||||
|
/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned
|
||||||
|
* offset indicates the starting element in which to load and
|
||||||
|
* offset + n <= unpacket_traits::size
|
||||||
|
* All elements before offset and after the last element loaded will initialized with zero */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
pload_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
|
||||||
|
{
|
||||||
|
const Index packet_size = unpacket_traits<Packet>::size;
|
||||||
|
eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
|
||||||
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||||
|
EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
|
||||||
|
for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
|
||||||
|
elements[i] = from[i-offset];
|
||||||
|
}
|
||||||
|
return pload<Packet>(elements);
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal \returns a packet version of \a *from, (un-aligned load) */
|
/** \internal \returns a packet version of \a *from, (un-aligned load) */
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||||
|
|
||||||
|
/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
|
||||||
|
* All elements after the last element loaded will initialized with zero */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
|
||||||
|
{
|
||||||
|
const Index packet_size = unpacket_traits<Packet>::size;
|
||||||
|
eigen_assert(n <= packet_size && "number of elements will read past end of packet");
|
||||||
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||||
|
EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
|
||||||
|
for (Index i = 0; i < numext::mini(n,packet_size); i++) {
|
||||||
|
elements[i] = from[i];
|
||||||
|
}
|
||||||
|
return pload<Packet>(elements);
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal \returns a packet version of \a *from, (un-aligned masked load)
|
/** \internal \returns a packet version of \a *from, (un-aligned masked load)
|
||||||
* There is no generic implementation. We only have implementations for specialized
|
* There is no generic implementation. We only have implementations for specialized
|
||||||
* cases. Generic case should not be called.
|
* cases. Generic case should not be called.
|
||||||
@ -704,14 +736,40 @@ peven_mask(const Packet& /*a*/) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
|
/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
|
||||||
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
|
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
|
||||||
{ (*to) = from; }
|
{ (*to) = from; }
|
||||||
|
|
||||||
|
/** \internal copy n elements of the packet \a from to \a *to, \a to must be properly aligned
|
||||||
|
* offset indicates the starting element in which to store and
|
||||||
|
* offset + n <= unpacket_traits::size */
|
||||||
|
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
|
||||||
|
{
|
||||||
|
const Index packet_size = unpacket_traits<Packet>::size;
|
||||||
|
eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
|
||||||
|
EIGEN_ALIGN_MAX Scalar elements[packet_size];
|
||||||
|
pstore<Scalar>(elements, from);
|
||||||
|
for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
|
||||||
|
to[i] = elements[i + offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
|
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
|
||||||
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
|
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
|
||||||
{ (*to) = from; }
|
{ (*to) = from; }
|
||||||
|
|
||||||
|
/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
|
||||||
|
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
|
||||||
|
{
|
||||||
|
const Index packet_size = unpacket_traits<Packet>::size;
|
||||||
|
eigen_assert(n <= packet_size && "number of elements will write past end of packet");
|
||||||
|
EIGEN_ALIGN_MAX Scalar elements[packet_size];
|
||||||
|
pstore<Scalar>(elements, from);
|
||||||
|
for (Index i = 0; i < numext::mini(n,packet_size); i++) {
|
||||||
|
to[i] = elements[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
|
/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
|
||||||
* There is no generic implementation. We only have implementations for specialized
|
* There is no generic implementation. We only have implementations for specialized
|
||||||
* cases. Generic case should not be called.
|
* cases. Generic case should not be called.
|
||||||
@ -721,11 +779,31 @@ EIGEN_DEVICE_FUNC inline
|
|||||||
std::enable_if_t<unpacket_traits<Packet>::masked_store_available, void>
|
std::enable_if_t<unpacket_traits<Packet>::masked_store_available, void>
|
||||||
pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
|
pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
|
||||||
|
|
||||||
/** \internal generic gather: the stride is ignored and the packet is read with an un-aligned load from \a from */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
{
  return ploadu<Packet>(from);
}
|
||||||
|
|
||||||
/** \internal \returns a packet whose first min(n, packet size) elements are read from
  * \a from[0], \a from[stride], \a from[2*stride], ...; the remaining elements are zero. */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  // Zero-filled, aligned staging buffer; the packet is produced by an aligned load from it.
  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
  const Index count = numext::mini(n,packet_size);
  for (Index k = 0; k < count; ++k) {
    elements[k] = from[k*stride];
  }
  return pload<Packet>(elements);
}
|
||||||
|
|
||||||
|
/** \internal generic scatter: the stride is ignored and the packet is written to \a to with an aligned store */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
{
  pstore(to, from);
}
|
||||||
|
|
||||||
|
/** \internal writes the first min(n, packet size) elements of the packet \a from to
  * \a to[0], \a to[stride], \a to[2*stride], ... */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  // Spill the whole packet to an aligned scratch buffer, then scatter the leading elements.
  EIGEN_ALIGN_MAX Scalar elements[packet_size];
  pstore<Scalar>(elements, from);
  const Index count = numext::mini(n,packet_size);
  for (Index k = 0; k < count; ++k) {
    to[k*stride] = elements[k];
  }
}
|
||||||
|
|
||||||
/** \internal tries to do cache prefetching of \a addr */
|
/** \internal tries to do cache prefetching of \a addr */
|
||||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||||
@ -996,6 +1074,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_trai
|
|||||||
return ploadu<Packet>(from);
|
return ploadu<Packet>(from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal \returns n elements of a packet version of \a *from.
|
||||||
|
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
|
||||||
|
template<typename Packet, int Alignment>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
|
||||||
|
{
|
||||||
|
if(Alignment >= unpacket_traits<Packet>::alignment)
|
||||||
|
return pload_partial<Packet>(from, n, offset);
|
||||||
|
else
|
||||||
|
return ploadu_partial<Packet>(from, n);
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal copy the packet \a from to \a *to.
|
/** \internal copy the packet \a from to \a *to.
|
||||||
* The pointer \a to must be aligned on a \a Alignment bytes boundary. */
|
* The pointer \a to must be aligned on a \a Alignment bytes boundary. */
|
||||||
template<typename Scalar, typename Packet, int Alignment>
|
template<typename Scalar, typename Packet, int Alignment>
|
||||||
@ -1007,6 +1096,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
|
|||||||
pstoreu(to, from);
|
pstoreu(to, from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal copy n elements of the packet \a from to \a *to.
|
||||||
|
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
|
||||||
|
template<typename Scalar, typename Packet, int Alignment>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
|
||||||
|
{
|
||||||
|
if(Alignment >= unpacket_traits<Packet>::alignment)
|
||||||
|
pstore_partial(to, from, n, offset);
|
||||||
|
else
|
||||||
|
pstoreu_partial(to, from, n);
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal \returns a packet version of \a *from.
|
/** \internal \returns a packet version of \a *from.
|
||||||
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
|
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
|
||||||
* hardware if available to speedup the loading of data that won't be modified
|
* hardware if available to speedup the loading of data that won't be modified
|
||||||
|
@ -132,10 +132,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
|
|||||||
|
|
||||||
/** \internal aligned load of a Packet2cf: the complex data is read as floats into a Packet4f */
template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
{
  const float* as_floats = reinterpret_cast<const float*>(from);
  return Packet2cf(pload<Packet4f>(as_floats));
}
|
||||||
/** \internal un-aligned load of a Packet2cf: the complex data is read as floats into a Packet4f */
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
{
  const float* as_floats = reinterpret_cast<const float*>(from);
  return Packet2cf(ploadu<Packet4f>(as_floats));
}
|
||||||
|
/** \internal partial aligned load of a Packet2cf, forwarded to the Packet4f version.
  * Each complex value spans two floats, so both \a n and \a offset are doubled. */
template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
{
  const float* as_floats = reinterpret_cast<const float*>(from);
  return Packet2cf(pload_partial<Packet4f>(as_floats, n * 2, offset * 2));
}
|
||||||
|
/** \internal partial un-aligned load of a Packet2cf, forwarded to the Packet4f version.
  * Each complex value spans two floats, so \a n is doubled. */
template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
{
  const float* as_floats = reinterpret_cast<const float*>(from);
  return Packet2cf(ploadu_partial<Packet4f>(as_floats, n * 2));
}
|
||||||
/** \internal ploaddup for Packet2cf: the value at \a from is handed to pset1<Packet2cf> */
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
{
  return pset1<Packet2cf>(from[0]);
}
|
||||||
|
|
||||||
/** \internal aligned store of a Packet2cf: its Packet4f payload is written through a float pointer */
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from)
{
  float* as_floats = reinterpret_cast<float*>(to);
  pstore(as_floats, from.v);
}
|
||||||
/** \internal un-aligned store of a Packet2cf: its Packet4f payload is written through a float pointer */
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from)
{
  float* as_floats = reinterpret_cast<float*>(to);
  pstoreu(as_floats, from.v);
}
|
||||||
|
/** \internal partial aligned store of a Packet2cf, forwarded to the float version.
  * Each complex value spans two floats, so both \a n and \a offset are doubled. */
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset)
{
  float* as_floats = reinterpret_cast<float*>(to);
  pstore_partial(as_floats, from.v, n * 2, offset * 2);
}
|
||||||
|
/** \internal partial un-aligned store of a Packet2cf, forwarded to the float version.
  * Each complex value spans two floats, so \a n is doubled. */
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n)
{
  float* as_floats = reinterpret_cast<float*>(to);
  pstoreu_partial(as_floats, from.v, n * 2);
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
|
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
|
||||||
{
|
{
|
||||||
@ -157,19 +167,46 @@ EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std
|
|||||||
return Packet2cf(res0);
|
return Packet2cf(res0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
|
template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 std::complex<float> af[2];
|
Packet2cf res;
|
||||||
af[0] = from[0*stride];
|
res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
|
||||||
af[1] = from[1*stride];
|
return res;
|
||||||
return pload<Packet2cf>(af);
|
|
||||||
}
|
}
|
||||||
/** \internal gathers \a n (default 2) strided scalars from \a from into a two-element buffer
  * and loads that buffer as a packet with pload_ignore. */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
{
  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
  // Only the first n slots are written here.
  EIGEN_ALIGN16 Scalar af[2];
  Index k = 0;
  while (k < n) {
    af[k] = from[k*stride];
    ++k;
  }
  return pload_ignore<Packet>(af);
}
|
||||||
|
/** \internal full strided gather of a Packet2cf; relies on the default element count of pgather_complex_size2 */
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{ return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride); }
|
||||||
|
/** \internal strided gather of the first \a n elements of a Packet2cf, forwarded to pgather_complex_size2 */
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n)
{ return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n); }
|
||||||
|
/** \internal spills the packet \a from into a two-element buffer and writes its first \a n (default 2)
  * elements to \a to[0], \a to[stride], ... */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
{
  eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
  EIGEN_ALIGN16 Scalar af[2];
  pstore<Scalar>(af, from);
  Index k = 0;
  while (k < n) {
    to[k*stride] = af[k];
    ++k;
  }
}
|
||||||
|
/** \internal full strided scatter of a Packet2cf; relies on the default element count of pscatter_complex_size2 */
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{ pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride); }
|
||||||
|
/** \internal strided scatter of the first \a n elements of a Packet2cf, forwarded to pscatter_complex_size2 */
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride, const Index n)
{ pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n); }
|
||||||
|
|
||||||
/** \internal \returns a + b, computed on the underlying Packet4f payloads */
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  const Packet4f sum = a.v + b.v;
  return Packet2cf(sum);
}
|
||||||
@ -336,17 +373,35 @@ template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type
|
|||||||
|
|
||||||
/** \internal aligned load of a Packet1cd: the complex value is read as doubles into a Packet2d */
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
{
  const double* as_doubles = reinterpret_cast<const double*>(from);
  return Packet1cd(pload<Packet2d>(as_doubles));
}
|
||||||
/** \internal un-aligned load of a Packet1cd: the complex value is read as doubles into a Packet2d */
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
{
  const double* as_doubles = reinterpret_cast<const double*>(from);
  return Packet1cd(ploadu<Packet2d>(as_doubles));
}
|
||||||
|
/** \internal partial aligned load of a Packet1cd, forwarded to the Packet2d version.
  * The complex value spans two doubles, so both \a n and \a offset are doubled. */
template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
{
  const double* as_doubles = reinterpret_cast<const double*>(from);
  return Packet1cd(pload_partial<Packet2d>(as_doubles, n * 2, offset * 2));
}
|
||||||
|
/** \internal partial un-aligned load of a Packet1cd, forwarded to the Packet2d version.
  * The complex value spans two doubles, so \a n is doubled. */
template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n)
{
  const double* as_doubles = reinterpret_cast<const double*>(from);
  return Packet1cd(ploadu_partial<Packet2d>(as_doubles, n * 2));
}
|
||||||
/** \internal aligned store of a Packet1cd: its Packet2d payload is written through a double pointer */
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from)
{
  double* as_doubles = reinterpret_cast<double*>(to);
  pstore(as_doubles, from.v);
}
|
||||||
/** \internal un-aligned store of a Packet1cd: its Packet2d payload is written through a double pointer */
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from)
{
  double* as_doubles = reinterpret_cast<double*>(to);
  pstoreu(as_doubles, from.v);
}
|
||||||
|
/** \internal partial aligned store of a Packet1cd, forwarded to the double version.
  * The complex value spans two doubles, so both \a n and \a offset are doubled. */
template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset)
{
  double* as_doubles = reinterpret_cast<double*>(to);
  pstore_partial(as_doubles, from.v, n * 2, offset * 2);
}
|
||||||
|
/** \internal partial un-aligned store of a Packet1cd, forwarded to the double version.
  * The complex value spans two doubles, so \a n is doubled. */
template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n)
{
  double* as_doubles = reinterpret_cast<double*>(to);
  pstoreu_partial(as_doubles, from.v, n * 2);
}
|
||||||
|
|
||||||
/** \internal \returns a Packet1cd holding the value \a from */
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{
  /* here we really have to use unaligned loads :( */
  const std::complex<double>* source = &from;
  return ploadu<Packet1cd>(source);
}
|
||||||
|
|
||||||
/** \internal gather for Packet1cd: the stride argument is unused and the value at \a from is read with an aligned load */
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
{ return pload<Packet1cd>(from); }
|
||||||
/** \internal partial gather for Packet1cd: the stride and element-count arguments are unused
  * and the value at \a from is read with an aligned load. */
// NOTE(review): the count is ignored, so a request for zero elements still reads *from - confirm callers never pass 0.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index)
{ return pload<Packet1cd>(from); }
|
||||||
|
/** \internal scatter for Packet1cd: the stride argument is unused and the packet is written to \a to with an aligned store */
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
{ pstore<std::complex<double> >(to, from); }
|
||||||
|
/** \internal partial scatter for Packet1cd: the stride and element-count arguments are unused
  * and the packet is written to \a to with an aligned store. */
// NOTE(review): the count is ignored, so a request for zero elements still writes *to - confirm callers never pass 0.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index, const Index)
{ pstore<std::complex<double> >(to, from); }
|
||||||
|
@ -143,6 +143,12 @@ static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32
|
|||||||
#define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
|
#define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Loop-unrolling hint (factor 16) for the load/store helper loops.
// The pragma spelling depends on whether EIGEN_COMP_LLVM is set.
#if !EIGEN_COMP_LLVM
#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
#else
#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
#endif
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct packet_traits<float> : default_packet_traits {
|
struct packet_traits<float> : default_packet_traits {
|
||||||
typedef Packet4f type;
|
typedef Packet4f type;
|
||||||
@ -471,6 +477,118 @@ template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* fr
|
|||||||
return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Packet>
|
||||||
|
EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from)
|
||||||
|
{
|
||||||
|
// some versions of GCC throw "unused-but-set-parameter".
|
||||||
|
// ignoring these warnings for now.
|
||||||
|
EIGEN_UNUSED_VARIABLE(from);
|
||||||
|
EIGEN_DEBUG_ALIGNED_LOAD
|
||||||
|
// Ignore partial input memory initialized
|
||||||
|
#if !EIGEN_COMP_LLVM
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||||||
|
#endif
|
||||||
|
#ifdef __VSX__
|
||||||
|
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
||||||
|
#else
|
||||||
|
return vec_ld(0, from);
|
||||||
|
#endif
|
||||||
|
#if !EIGEN_COMP_LLVM
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/** \internal pload_ignore for Packet8bf: the bfloat16 data is read as unsigned shorts through pload_ignore<Packet8us> */
template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from)
{
  const unsigned short int* raw = reinterpret_cast<const unsigned short int*>(from);
  return pload_ignore<Packet8us>(raw);
}
|
||||||
|
|
||||||
|
template <typename Packet>
|
||||||
|
EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
|
||||||
|
{
|
||||||
|
// some versions of GCC throw "unused-but-set-parameter".
|
||||||
|
// ignoring these warnings for now.
|
||||||
|
const Index packet_size = unpacket_traits<Packet>::size;
|
||||||
|
eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
|
||||||
|
const Index size = sizeof(__UNPACK_TYPE__(Packet));
|
||||||
|
#ifdef _ARCH_PWR9
|
||||||
|
EIGEN_DEBUG_ALIGNED_LOAD
|
||||||
|
EIGEN_UNUSED_VARIABLE(from);
|
||||||
|
Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
|
||||||
|
if (offset) {
|
||||||
|
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
|
||||||
|
#ifdef _BIG_ENDIAN
|
||||||
|
load = Packet(vec_sro(Packet16uc(load), shift));
|
||||||
|
#else
|
||||||
|
load = Packet(vec_slo(Packet16uc(load), shift));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return load;
|
||||||
|
#else
|
||||||
|
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
|
||||||
|
unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
|
||||||
|
unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
||||||
|
Index n2 = n * size;
|
||||||
|
Index i = 0;
|
||||||
|
if (16 <= n2) {
|
||||||
|
pstoreu(load2, ploadu<Packet16uc>(from2));
|
||||||
|
i += 16;
|
||||||
|
}
|
||||||
|
if (i + 8 <= n2) {
|
||||||
|
*reinterpret_cast<uint64_t *>(load2 + i) = *reinterpret_cast<uint64_t *>(from2 + i);
|
||||||
|
i += 8;
|
||||||
|
}
|
||||||
|
if (i + 4 <= n2) {
|
||||||
|
*reinterpret_cast<uint32_t *>(load2 + i) = *reinterpret_cast<uint32_t *>(from2 + i);
|
||||||
|
i += 4;
|
||||||
|
}
|
||||||
|
if (i + 2 <= n2) {
|
||||||
|
*reinterpret_cast<uint16_t *>(load2 + i) = *reinterpret_cast<uint16_t *>(from2 + i);
|
||||||
|
i += 2;
|
||||||
|
}
|
||||||
|
if (i < n2) {
|
||||||
|
*reinterpret_cast<uint8_t *>(load2 + i) = *reinterpret_cast<uint8_t *>(from2 + i);
|
||||||
|
}
|
||||||
|
return pload_ignore<Packet>(load);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet4f; forwards to pload_partial_common */
template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
{ return pload_partial_common<Packet4f>(from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet4i; forwards to pload_partial_common */
template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
{ return pload_partial_common<Packet4i>(from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet8s; forwards to pload_partial_common */
template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
{ return pload_partial_common<Packet8s>(from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet8us; forwards to pload_partial_common */
template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
{ return pload_partial_common<Packet8us>(from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet8bf; the bfloat16 data is read as unsigned shorts
  * through pload_partial_common<Packet8us>. */
template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
{
  const unsigned short int* raw = reinterpret_cast<const unsigned short int*>(from);
  return pload_partial_common<Packet8us>(raw, n, offset);
}
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet16c; forwards to pload_partial_common */
template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
{ return pload_partial_common<Packet16c>(from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned load for Packet16uc; forwards to pload_partial_common */
template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
{ return pload_partial_common<Packet16uc>(from, n, offset); }
|
||||||
|
|
||||||
template <typename Packet>
|
template <typename Packet>
|
||||||
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
|
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
|
||||||
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
||||||
@ -519,6 +637,90 @@ template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* t
|
|||||||
pstore_common<Packet16uc>(to, from);
|
pstore_common<Packet16uc>(to, from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal writes \a n elements of the packet \a from, starting at packet element \a offset, to \a to.
  * offset + n <= unpacket_traits<Packet>::size must hold.
  * On Power9 the packet is shifted and written with a length-limited vector store;
  * otherwise it is spilled to an aligned buffer and the requested bytes are copied out. */
template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  // packet_size is only consumed by the assertion above on this path.
  EIGEN_UNUSED_VARIABLE(packet_size);
  // some versions of GCC throw "unused-but-set-parameter" (float *to).
  // ignoring these warnings for now.
  EIGEN_UNUSED_VARIABLE(to);
  EIGEN_DEBUG_ALIGNED_STORE
  Packet store = from;
  if (offset) {
    // Bring packet element `offset` to the front; the shift amount is given in bits.
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    store = Packet(vec_slo(Packet16uc(store), shift));
#else
    store = Packet(vec_sro(Packet16uc(store), shift));
#endif
  }
  // Store only the first n * size bytes.
  vec_xst_len(store, to, n * size);
#else
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
  pstore(store, from);
  const unsigned char* store2 = reinterpret_cast<const unsigned char *>(store + offset);
  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
  const Index n2 = n * size;
  if (16 <= n2) {
    // Whole packet requested: a single aligned vector store to the destination.
    pstore(to2, ploadu<Packet16uc>(store2));
  } else {
    // Byte-wise copy: unsigned char access is valid for any object type and has no
    // alignment requirement, unlike wider integer accesses through reinterpret_cast.
    for (Index i = 0; i < n2; i++) {
      to2[i] = store2[i];
    }
  }
#endif
}
|
||||||
|
|
||||||
|
/** \internal partial aligned store for Packet4f; forwards to pstore_partial_common */
template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
{ pstore_partial_common<Packet4f>(to, from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned store for Packet4i; forwards to pstore_partial_common */
template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
{ pstore_partial_common<Packet4i>(to, from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned store for Packet8s; forwards to pstore_partial_common */
template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
{ pstore_partial_common<Packet8s>(to, from, n, offset); }
|
||||||
|
|
||||||
|
/** \internal partial aligned store for Packet8us; forwards to pstore_partial_common */
template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
{ pstore_partial_common<Packet8us>(to, from, n, offset); }
|
||||||
|
|
||||||
|
template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
|
||||||
|
{
|
||||||
|
pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
|
||||||
|
{
|
||||||
|
pstore_partial_common<Packet16c>(to, from, n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
|
||||||
|
{
|
||||||
|
pstore_partial_common<Packet16uc>(to, from, n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename Packet>
|
template<typename Packet>
|
||||||
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
|
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
|
||||||
{
|
{
|
||||||
@ -596,168 +798,167 @@ pbroadcast4<Packet4i>(const int *a,
|
|||||||
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
|
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
|
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
|
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
|
||||||
a[0] = from[0*stride];
|
eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
|
||||||
a[1] = from[1*stride];
|
LOAD_STORE_UNROLL_16
|
||||||
a[2] = from[2*stride];
|
for (Index i = 0; i < n; i++) {
|
||||||
a[3] = from[3*stride];
|
a[i] = from[i*stride];
|
||||||
return pload<Packet>(a);
|
}
|
||||||
|
// Leave rest of the array uninitialized
|
||||||
|
return pload_ignore<Packet>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
||||||
{
|
{
|
||||||
return pgather_common<Packet4f>(from, stride);
|
return pgather_common<Packet4f>(from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride)
|
||||||
{
|
{
|
||||||
return pgather_common<Packet4i>(from, stride);
|
return pgather_common<Packet4i>(from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
|
return pgather_common<Packet8s>(from, stride);
|
||||||
a[0] = from[0*stride];
|
|
||||||
a[1] = from[1*stride];
|
|
||||||
a[2] = from[2*stride];
|
|
||||||
a[3] = from[3*stride];
|
|
||||||
a[4] = from[4*stride];
|
|
||||||
a[5] = from[5*stride];
|
|
||||||
a[6] = from[6*stride];
|
|
||||||
a[7] = from[7*stride];
|
|
||||||
return pload<Packet>(a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
|
||||||
{
|
{
|
||||||
return pgather_size8<Packet8s>(from, stride);
|
return pgather_common<Packet8us>(from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
|
||||||
{
|
{
|
||||||
return pgather_size8<Packet8us>(from, stride);
|
return pgather_common<Packet8bf>(from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
|
||||||
{
|
{
|
||||||
return pgather_size8<Packet8bf>(from, stride);
|
return pgather_common<Packet16c>(from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
|
return pgather_common<Packet16uc>(from, stride);
|
||||||
a[0] = from[0*stride];
|
|
||||||
a[1] = from[1*stride];
|
|
||||||
a[2] = from[2*stride];
|
|
||||||
a[3] = from[3*stride];
|
|
||||||
a[4] = from[4*stride];
|
|
||||||
a[5] = from[5*stride];
|
|
||||||
a[6] = from[6*stride];
|
|
||||||
a[7] = from[7*stride];
|
|
||||||
a[8] = from[8*stride];
|
|
||||||
a[9] = from[9*stride];
|
|
||||||
a[10] = from[10*stride];
|
|
||||||
a[11] = from[11*stride];
|
|
||||||
a[12] = from[12*stride];
|
|
||||||
a[13] = from[13*stride];
|
|
||||||
a[14] = from[14*stride];
|
|
||||||
a[15] = from[15*stride];
|
|
||||||
return pload<Packet>(a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride, const Index n)
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
|
|
||||||
{
|
{
|
||||||
return pgather_size16<Packet16c>(from, stride);
|
return pgather_common<Packet4f>(from, stride, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride, const Index n)
|
||||||
{
|
{
|
||||||
return pgather_size16<Packet16uc>(from, stride);
|
return pgather_common<Packet4i>(from, stride, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride, const Index n)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
|
return pgather_common<Packet8s>(from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
return pgather_common<Packet8us>(from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
return pgather_common<Packet8bf>(from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
return pgather_common<Packet16c>(from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
return pgather_common<Packet16uc>(from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
|
||||||
|
{
|
||||||
|
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
|
||||||
|
eigen_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
|
||||||
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
||||||
to[0*stride] = a[0];
|
LOAD_STORE_UNROLL_16
|
||||||
to[1*stride] = a[1];
|
for (Index i = 0; i < n; i++) {
|
||||||
to[2*stride] = a[2];
|
to[i*stride] = a[i];
|
||||||
to[3*stride] = a[3];
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
||||||
{
|
{
|
||||||
pscatter_size4<Packet4f>(to, from, stride);
|
pscatter_common<Packet4f>(to, from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
||||||
{
|
{
|
||||||
pscatter_size4<Packet4i>(to, from, stride);
|
pscatter_common<Packet4i>(to, from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
|
pscatter_common<Packet8s>(to, from, stride);
|
||||||
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
|
||||||
to[0*stride] = a[0];
|
|
||||||
to[1*stride] = a[1];
|
|
||||||
to[2*stride] = a[2];
|
|
||||||
to[3*stride] = a[3];
|
|
||||||
to[4*stride] = a[4];
|
|
||||||
to[5*stride] = a[5];
|
|
||||||
to[6*stride] = a[6];
|
|
||||||
to[7*stride] = a[7];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
|
|
||||||
{
|
{
|
||||||
pscatter_size8<Packet8s>(to, from, stride);
|
pscatter_common<Packet8us>(to, from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
|
||||||
{
|
{
|
||||||
pscatter_size8<Packet8us>(to, from, stride);
|
pscatter_common<Packet8bf>(to, from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
|
||||||
{
|
{
|
||||||
pscatter_size8<Packet8bf>(to, from, stride);
|
pscatter_common<Packet16c>(to, from, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
|
pscatter_common<Packet16uc>(to, from, stride);
|
||||||
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
|
||||||
to[0*stride] = a[0];
|
|
||||||
to[1*stride] = a[1];
|
|
||||||
to[2*stride] = a[2];
|
|
||||||
to[3*stride] = a[3];
|
|
||||||
to[4*stride] = a[4];
|
|
||||||
to[5*stride] = a[5];
|
|
||||||
to[6*stride] = a[6];
|
|
||||||
to[7*stride] = a[7];
|
|
||||||
to[8*stride] = a[8];
|
|
||||||
to[9*stride] = a[9];
|
|
||||||
to[10*stride] = a[10];
|
|
||||||
to[11*stride] = a[11];
|
|
||||||
to[12*stride] = a[12];
|
|
||||||
to[13*stride] = a[13];
|
|
||||||
to[14*stride] = a[14];
|
|
||||||
to[15*stride] = a[15];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
|
||||||
{
|
{
|
||||||
pscatter_size16<Packet16c>(to, from, stride);
|
pscatter_common<Packet4f>(to, from, stride, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
|
||||||
{
|
{
|
||||||
pscatter_size16<Packet16uc>(to, from, stride);
|
pscatter_common<Packet4i>(to, from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
pscatter_common<Packet8s>(to, from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
pscatter_common<Packet8us>(to, from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
pscatter_common<Packet8bf>(to, from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
pscatter_common<Packet16c>(to, from, stride, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
pscatter_common<Packet16uc>(to, from, stride, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
|
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
|
||||||
@ -1008,6 +1209,73 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char
|
|||||||
return ploadu_common<Packet16uc>(from);
|
return ploadu_common<Packet16uc>(from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal Unaligned partial load shared by the ploadu_partial specializations.
  * Reads the first \a n scalars found at \a from and returns them in the
  * leading lanes of a Packet. \a n must not exceed the packet size (asserted).
  * In the fallback path the remaining lanes come from an uninitialized scratch
  * array, so their contents are unspecified. */
template<typename Packet>
EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
{
  const Index lane_count = unpacket_traits<Packet>::size;
  eigen_assert(n <= lane_count && "number of elements will read past end of packet");
  const Index scalar_bytes = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  // NOTE(review): both the aligned and the unaligned load counters are bumped here - confirm intended.
  EIGEN_DEBUG_ALIGNED_LOAD
  EIGEN_DEBUG_UNALIGNED_LOAD
  // Length-controlled vector load: the length is given in bytes.
  return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * scalar_bytes);
#else
  // Stage the requested bytes in an aligned scratch array, then load the packet from it.
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) staging[lane_count];
  unsigned char* dst = reinterpret_cast<unsigned char *>(staging);
  unsigned char* src = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
  Index total_bytes = n * scalar_bytes;
  Index copied = 0;

  // Copy in chunks of 16, 8, 4, 2 and 1 bytes; each chunk size is used at most once.
  if (total_bytes >= 16) {
    pstore(dst, ploadu<Packet16uc>(src));
    copied += 16;
  }
  if (copied + 8 <= total_bytes) {
    *reinterpret_cast<uint64_t *>(dst + copied) = *reinterpret_cast<uint64_t *>(src + copied);
    copied += 8;
  }
  if (copied + 4 <= total_bytes) {
    *reinterpret_cast<uint32_t *>(dst + copied) = *reinterpret_cast<uint32_t *>(src + copied);
    copied += 4;
  }
  if (copied + 2 <= total_bytes) {
    *reinterpret_cast<uint16_t *>(dst + copied) = *reinterpret_cast<uint16_t *>(src + copied);
    copied += 2;
  }
  if (copied < total_bytes) {
    *reinterpret_cast<uint8_t *>(dst + copied) = *reinterpret_cast<uint8_t *>(src + copied);
  }

  // Lanes past n were never written; pload_ignore is used for the load of the partly filled array.
  return pload_ignore<Packet>(staging);
#endif
}
|
||||||
|
|
||||||
|
/** \internal Unaligned partial-load specializations.
  * Every overload forwards to ploadu_partial_common, which reads the first
  * \a n elements at \a from into the leading lanes of the returned packet. */
template<>
EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n)
{
  return ploadu_partial_common<Packet4f>(from, n);
}

template<>
EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n)
{
  return ploadu_partial_common<Packet4i>(from, n);
}

template<>
EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n)
{
  return ploadu_partial_common<Packet8s>(from, n);
}

template<>
EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n)
{
  return ploadu_partial_common<Packet8us>(from, n);
}

// bfloat16 is loaded through the unsigned short path: the source pointer is
// reinterpreted and the Packet8us result is returned as a Packet8bf.
// NOTE(review): relies on Packet8us being convertible to Packet8bf - confirm.
template<>
EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n)
{
  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n);
}

template<>
EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n)
{
  return ploadu_partial_common<Packet16c>(from, n);
}

template<>
EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n)
{
  return ploadu_partial_common<Packet16uc>(from, n);
}
|
||||||
|
|
||||||
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
|
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
|
||||||
{
|
{
|
||||||
Packet p;
|
Packet p;
|
||||||
@ -1128,6 +1396,77 @@ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* t
|
|||||||
pstoreu_common<Packet16uc>(to, from);
|
pstoreu_common<Packet16uc>(to, from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal Unaligned partial store shared by the pstoreu_partial specializations.
  * Writes the first \a n elements of \a from to \a to[0..n). \a n must not
  * exceed the packet size (asserted). The fallback path writes exactly
  * n * sizeof(scalar) bytes and nothing beyond them. */
template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n)
{
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_DEBUG_UNALIGNED_STORE
  // Length-controlled vector store: the length is given in bytes.
  vec_xst_len(from, to, n * size);
#else
  // Spill the whole packet to an aligned scratch array, then copy the first n2 bytes out of it.
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
  pstore(store, from);
  unsigned char* store2 = reinterpret_cast<unsigned char *>(store);
  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
  Index n2 = n * size;
  Index i = 0;
  // Copy in chunks of 16, 8, 4, 2 and 1 bytes; each chunk size is used at most once,
  // which covers every byte count from 0 to 16.
  if (16 <= n2) {
    pstoreu(to2, pload<Packet16uc>(store2));
    i += 16;
  }
  if (i + 8 <= n2) {
    *reinterpret_cast<uint64_t *>(to2 + i) = *reinterpret_cast<uint64_t *>(store2 + i);
    i += 8;
  }
  if (i + 4 <= n2) {
    *reinterpret_cast<uint32_t *>(to2 + i) = *reinterpret_cast<uint32_t *>(store2 + i);
    i += 4;
  }
  if (i + 2 <= n2) {
    *reinterpret_cast<uint16_t *>(to2 + i) = *reinterpret_cast<uint16_t *>(store2 + i);
    i += 2;
  }
  if (i < n2) {
    *reinterpret_cast<uint8_t *>(to2 + i) = *reinterpret_cast<uint8_t *>(store2 + i);
  }
  // The cascade above has already written all n2 bytes, so no per-element
  // copy loop follows (a second pass would only store the same n elements again).
#endif
}
|
||||||
|
|
||||||
|
/** \internal Unaligned partial-store specializations.
  * Every overload forwards to pstoreu_partial_common, which writes the first
  * \a n elements of \a from to the memory beginning at \a to. */
template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n)
{
  pstoreu_partial_common<Packet4f>(to, from, n);
}

template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n)
{
  pstoreu_partial_common<Packet4i>(to, from, n);
}

template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n)
{
  pstoreu_partial_common<Packet8s>(to, from, n);
}

template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n)
{
  pstoreu_partial_common<Packet8us>(to, from, n);
}

// bfloat16 goes through the unsigned short path: the destination pointer is
// reinterpreted and the helper is instantiated for Packet8us.
// NOTE(review): relies on Packet8bf being convertible to Packet8us - confirm.
template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n)
{
  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n);
}

template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n)
{
  pstoreu_partial_common<Packet16c>(to, from, n);
}

template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n)
{
  pstoreu_partial_common<Packet16uc>(to, from, n);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
|
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
|
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||||
|
|
||||||
@ -2387,12 +2726,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
|
|||||||
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
|
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal Partial load of doubles into a Packet2d.
  * Forwards \a from, \a n and \a offset unchanged to pload_partial_common. */
template<>
EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
{
  return pload_partial_common<Packet2d>(from, n, offset);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
|
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
|
||||||
{
|
{
|
||||||
EIGEN_DEBUG_ALIGNED_STORE
|
EIGEN_DEBUG_ALIGNED_STORE
|
||||||
vec_xst(from, 0, to);
|
vec_xst(from, 0, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal Partial store of a Packet2d to doubles.
  * Forwards \a to, \a from, \a n and \a offset unchanged to pstore_partial_common. */
template<>
EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
{
  pstore_partial_common<Packet2d>(to, from, n, offset);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
||||||
Packet2d v = {from, from};
|
Packet2d v = {from, from};
|
||||||
return v;
|
return v;
|
||||||
@ -2414,19 +2763,21 @@ pbroadcast4<Packet2d>(const double *a,
|
|||||||
a3 = pset1<Packet2d>(a[3]);
|
a3 = pset1<Packet2d>(a[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 double af[2];
|
return pgather_common<Packet2d>(from, stride);
|
||||||
af[0] = from[0*stride];
|
|
||||||
af[1] = from[1*stride];
|
|
||||||
return pload<Packet2d>(af);
|
|
||||||
}
|
}
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN16 double af[2];
|
return pgather_common<Packet2d>(from, stride, n);
|
||||||
pstore<double>(af, from);
|
}
|
||||||
to[0*stride] = af[0];
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
||||||
to[1*stride] = af[1];
|
{
|
||||||
|
pscatter_common<Packet2d>(to, from, stride);
|
||||||
|
}
|
||||||
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
|
||||||
|
{
|
||||||
|
pscatter_common<Packet2d>(to, from, stride, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
|
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
|
||||||
@ -2517,6 +2868,11 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
|
|||||||
return vec_xl(0, const_cast<double*>(from));
|
return vec_xl(0, const_cast<double*>(from));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal Unaligned partial load of the first \a n doubles at \a from
  * into a Packet2d; forwards to ploadu_partial_common. */
template<>
EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n)
{
  return ploadu_partial_common<Packet2d>(from, n);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
||||||
{
|
{
|
||||||
Packet2d p;
|
Packet2d p;
|
||||||
@ -2531,6 +2887,11 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
|
|||||||
vec_xst(from, 0, to);
|
vec_xst(from, 0, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal Unaligned partial store of the first \a n doubles of \a from
  * to \a to; forwards to pstoreu_partial_common. */
template<>
EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n)
{
  pstoreu_partial_common<Packet2d>(to, from, n);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
|
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
|
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
|
||||||
|
@ -100,6 +100,11 @@ public:
|
|||||||
return ploadt<PacketType, AlignmentType>(m_data + i);
|
return ploadt<PacketType, AlignmentType>(m_data + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const {
|
||||||
|
return ploadt_partial<PacketType, AlignmentType>(m_data + i, n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename PacketType, int AlignmentT>
|
template<typename PacketType, int AlignmentT>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const {
|
||||||
return ploadt<PacketType, AlignmentT>(m_data + i);
|
return ploadt<PacketType, AlignmentT>(m_data + i);
|
||||||
@ -110,6 +115,11 @@ public:
|
|||||||
pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
|
pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index offset = 0) const {
|
||||||
|
pstoret_partial<Scalar, PacketType, AlignmentType>(m_data + i, p, n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Scalar *m_data;
|
Scalar *m_data;
|
||||||
};
|
};
|
||||||
@ -208,6 +218,11 @@ public:
|
|||||||
return ploadt<PacketType, AlignmentType>(&operator()(i, j));
|
return ploadt<PacketType, AlignmentType>(&operator()(i, j));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index offset = 0) const {
|
||||||
|
return ploadt_partial<PacketType, AlignmentType>(&operator()(i, j), n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename PacketT, int AlignmentT>
|
template <typename PacketT, int AlignmentT>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
|
||||||
return ploadt<PacketT, AlignmentT>(&operator()(i, j));
|
return ploadt<PacketT, AlignmentT>(&operator()(i, j));
|
||||||
@ -218,6 +233,11 @@ public:
|
|||||||
pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
|
pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index offset = 0) const {
|
||||||
|
pstoret_partial<Scalar, PacketType, AlignmentType>(&operator()(i, j), p, n, offset);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename SubPacket>
|
template<typename SubPacket>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
|
||||||
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
||||||
@ -271,11 +291,21 @@ public:
|
|||||||
return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
|
return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/) const {
|
||||||
|
return pgather_partial<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value(), n);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename PacketType>
|
template<typename PacketType>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
|
||||||
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
|
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Partial-packet store for the strided linear mapper: scatters packet p via
// pscatter_partial starting at m_data + i*m_incr.value(), stepping by
// m_incr.value(), with n forwarded as the partial element count.
// The offset argument is accepted only for signature compatibility and is
// ignored here; it now defaults to 0 so that callers may omit it, matching
// the defaulted offset of the aligned storePacketPartial overload.
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/ = 0) const {
|
||||||
|
pscatter_partial<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value(), n);
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Scalar *m_data;
|
Scalar *m_data;
|
||||||
const internal::variable_if_dynamic<Index,Incr> m_incr;
|
const internal::variable_if_dynamic<Index,Incr> m_incr;
|
||||||
@ -312,6 +342,11 @@ public:
|
|||||||
return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
|
return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Partial-packet load for the strided 2-D mapper: gathers via
// pgather_partial starting at the address of element (i, j), stepping by
// m_incr.value(), with n forwarded as the partial element count.
// The offset argument is accepted only for signature compatibility and is
// ignored here; it now defaults to 0 so that callers may omit it, matching
// the defaulted offset of the aligned storePacketPartial overload.
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/ = 0) const {
|
||||||
|
return pgather_partial<Scalar,PacketType>(&operator()(i, j),m_incr.value(),n);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename PacketT, int AlignmentT>
|
template <typename PacketT, int AlignmentT>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
|
||||||
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
|
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
|
||||||
@ -322,6 +357,11 @@ public:
|
|||||||
pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
|
pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Partial-packet store for the strided 2-D mapper: scatters packet p via
// pscatter_partial starting at the address of element (i, j), stepping by
// m_incr.value(), with n forwarded as the partial element count.
// The offset argument is accepted only for signature compatibility and is
// ignored here; it now defaults to 0 so that generic code written against the
// aligned mapper's storePacketPartial(i, j, p, n) also compiles for this one.
template<typename PacketType>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/ = 0) const {
|
||||||
|
pscatter_partial<Scalar, PacketType>(&operator()(i, j), p, m_incr.value(), n);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename SubPacket>
|
template<typename SubPacket>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
|
||||||
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
||||||
|
@ -458,6 +458,36 @@ void packetmath() {
|
|||||||
VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
|
VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int M = 0; M < PacketSize; ++M) {
|
||||||
|
for (int N = 0; N <= PacketSize; ++N) {
|
||||||
|
for (int j = 0; j < size; ++j) {
|
||||||
|
data1[j] = internal::random<Scalar>() / RealScalar(PacketSize);
|
||||||
|
data2[j] = internal::random<Scalar>() / RealScalar(PacketSize);
|
||||||
|
refvalue = (std::max)(refvalue, numext::abs(data1[j]));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (M == 0) {
|
||||||
|
internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N), N);
|
||||||
|
VERIFY(test::areApprox(data1, data2, N) && "aligned loadN/storeN");
|
||||||
|
|
||||||
|
for (int offset = 0; offset < PacketSize; ++offset) {
|
||||||
|
internal::pstore_partial(data2, internal::ploadu_partial<Packet>(data1 + offset, N), N);
|
||||||
|
VERIFY(test::areApprox(data1 + offset, data2, N) && "internal::ploadu_partial");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int offset = 0; offset < PacketSize; ++offset) {
|
||||||
|
internal::pstoreu_partial(data2 + offset, internal::pload_partial<Packet>(data1, N), N);
|
||||||
|
VERIFY(test::areApprox(data1, data2 + offset, N) && "internal::pstoreu_partial");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (N + M > PacketSize) continue; // Don't read or write past end of Packet
|
||||||
|
|
||||||
|
internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N, M), N, M);
|
||||||
|
VERIFY(test::areApprox(data1, data2, N) && "aligned offset loadN/storeN");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (internal::unpacket_traits<Packet>::masked_load_available) {
|
if (internal::unpacket_traits<Packet>::masked_load_available) {
|
||||||
test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
|
test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
|
||||||
unsigned long long max_umask = (0x1ull << PacketSize);
|
unsigned long long max_umask = (0x1ull << PacketSize);
|
||||||
@ -1372,6 +1402,36 @@ void packetmath_scatter_gather() {
|
|||||||
for (int i = 0; i < PacketSize; ++i) {
|
for (int i = 0; i < PacketSize; ++i) {
|
||||||
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
|
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (Index N = 0; N <= PacketSize; ++N) {
|
||||||
|
for (Index i = 0; i < N; ++i) {
|
||||||
|
data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Index i = 0; i < N * 20; ++i) {
|
||||||
|
buffer[i] = Scalar(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
packet = internal::pload_partial<Packet>(data1, N);
|
||||||
|
internal::pscatter_partial<Scalar, Packet>(buffer, packet, stride, N);
|
||||||
|
|
||||||
|
for (Index i = 0; i < N * 20; ++i) {
|
||||||
|
if ((i % stride) == 0 && i < stride * N) {
|
||||||
|
VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter_partial");
|
||||||
|
} else {
|
||||||
|
VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter_partial");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Index i = 0; i < N * 7; ++i) {
|
||||||
|
buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
|
||||||
|
}
|
||||||
|
packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
|
||||||
|
internal::pstore_partial(data1, packet, N);
|
||||||
|
for (Index i = 0; i < N; ++i) {
|
||||||
|
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather_partial");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user