diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index bfc7ae68a..3d2f144d7 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -730,14 +730,14 @@ ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 /** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
   * All elements after the last element loaded will initialized with zero */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
+ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n, const Index offset = 0)
 {
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
   typedef typename unpacket_traits<Packet>::type Scalar;
   EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
-  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
-    elements[i] = from[i];
+  for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
+    elements[i] = from[i-offset];
   }
   return pload<Packet>(elements);
 }
@@ -855,14 +855,14 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 { (*to) = from; }
 
 /** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n)
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0)
 {
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_assert(n <= packet_size && "number of elements will write past end of packet");
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
   EIGEN_ALIGN_MAX Scalar elements[packet_size];
   pstore<Scalar>(elements, from);
-  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
-    to[i] = elements[i];
+  for (Index i = 0; i < numext::mini(n,packet_size-offset); i++) {
+    to[i] = elements[i + offset];
   }
 }
 
@@ -1201,7 +1201,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpac
   if(Alignment >= unpacket_traits<Packet>::alignment)
     return pload_partial<Packet>(from, n, offset);
   else
-    return ploadu_partial<Packet>(from, n);
+    return ploadu_partial<Packet>(from, n, offset);
 }
 
 /** \internal copy the packet \a from to \a *to.
@@ -1223,7 +1223,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Pac
   if(Alignment >= unpacket_traits<Packet>::alignment)
     pstore_partial(to, from, n, offset);
   else
-    pstoreu_partial(to, from, n);
+    pstoreu_partial(to, from, n, offset);
 }
 
 /** \internal \returns a packet version of \a *from.
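Note: the block below is a standalone sketch (not Eigen code) that models what the generic fallbacks above now guarantee: ploadu_partial zero-fills the packet and places the n loaded values in lanes [offset, offset+n), and pstoreu_partial writes out the n values held in those same lanes. The 4-lane packet and the *_model names are invented for illustration only.

```cpp
// Standalone sketch: models the offset semantics of the generic
// ploadu_partial / pstoreu_partial fallbacks shown in the hunk above.
#include <algorithm>
#include <cassert>
#include <cstdio>

constexpr int kPacketSize = 4;                 // stand-in for unpacket_traits<Packet>::size
struct PacketModel { float lane[kPacketSize]; };

PacketModel ploadu_partial_model(const float* from, int n, int offset = 0) {
  assert(n + offset <= kPacketSize);
  PacketModel p{};                             // lanes outside [offset, offset+n) stay zero
  for (int i = offset; i < std::min(n + offset, kPacketSize); ++i) p.lane[i] = from[i - offset];
  return p;
}

void pstoreu_partial_model(float* to, const PacketModel& p, int n, int offset = 0) {
  assert(n + offset <= kPacketSize);
  for (int i = 0; i < std::min(n, kPacketSize - offset); ++i) to[i] = p.lane[i + offset];
}

int main() {
  const float src[2] = {1.f, 2.f};
  PacketModel p = ploadu_partial_model(src, /*n=*/2, /*offset=*/1);   // lanes: 0 1 2 0
  float dst[2] = {0.f, 0.f};
  pstoreu_partial_model(dst, p, /*n=*/2, /*offset=*/1);               // writes lanes 1..2 -> 1 2
  std::printf("%g %g %g %g | %g %g\n", p.lane[0], p.lane[1], p.lane[2], p.lane[3], dst[0], dst[1]);
  return 0;
}
```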
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 69cc0689f..e24581fd4 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -136,16 +136,16 @@ template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::com
 {
   return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
 }
-template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
 {
-  return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2));
+  return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2, offset * 2));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
 
 template<> EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
 template<> EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstoreu_partial((float*)to, from.v, n * 2, offset * 2); }
 
 EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
@@ -382,14 +382,14 @@ template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::com
 {
   return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
 }
-template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
 {
-  return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2));
+  return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2, offset * 2));
 }
 template<> EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
 template<> EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); }
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstoreu_partial((double*)to, from.v, n * 2, offset * 2); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
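Note: a minimal standalone sketch (not Eigen code) of why the complex wrappers above pass n * 2 and offset * 2 down to the real-valued packets: each std::complex<float> occupies two float lanes, so one complex element at complex-lane offset corresponds to two floats at float-lane 2 * offset.

```cpp
// Standalone sketch: one complex value loaded at complex-lane offset 1 of a
// 2-complex (4-float) packet image lands in float lanes 2..3.
#include <complex>
#include <cstdio>

int main() {
  const std::complex<float> src[1] = { {3.f, 4.f} };
  const float* as_float = reinterpret_cast<const float*>(src);  // complex<float> is two floats
  float lanes[4] = {0.f, 0.f, 0.f, 0.f};                        // zero-filled packet image
  const int n = 1, offset = 1;
  for (int i = offset * 2; i < (n + offset) * 2; ++i) lanes[i] = as_float[i - offset * 2];
  std::printf("%g %g %g %g\n", lanes[0], lanes[1], lanes[2], lanes[3]);  // 0 0 3 4
  return 0;
}
```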
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 7e0c75918..f12dc19ac 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1260,26 +1260,36 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char
   return ploadu_common<Packet16uc>(from);
 }
 
-template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n)
+template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
 {
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_internal_assert(n <= packet_size && "number of elements will read past end of packet");
+  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
   EIGEN_UNUSED_VARIABLE(packet_size);
   EIGEN_DEBUG_ALIGNED_LOAD
   EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    load = Packet(vec_sro(Packet16uc(load), shift));
+#else
+    load = Packet(vec_slo(Packet16uc(load), shift));
+#endif
+  }
+  return load;
 #else
   if (n) {
+    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+    unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
+    unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
    Index n2 = n * size;
     if (16 <= n2) {
-      return ploadu<Packet>(from);
+      pstoreu(load2, ploadu<Packet16uc>(from2));
+    } else {
+      memcpy((void *)load2, (void *)from2, n2);
     }
-    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
-    unsigned char* load2 = reinterpret_cast<unsigned char *>(load);
-    unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
-    memcpy((void *)load2, (void *)from2, n2);
     return pload_ignore<Packet>(load);
   } else {
     return Packet(pset1<Packet>(0));
@@ -1287,33 +1297,33 @@ template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const
 #endif
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet4f>(from, n);
+  return ploadu_partial_common<Packet4f>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet4i>(from, n);
+  return ploadu_partial_common<Packet4i>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet8s>(from, n);
+  return ploadu_partial_common<Packet8s>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet8us>(from, n);
+  return ploadu_partial_common<Packet8us>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n);
+  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet16c>(from, n);
+  return ploadu_partial_common<Packet16c>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet16uc>(from, n);
+  return ploadu_partial_common<Packet16uc>(from, n, offset);
 }
 
 template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
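Note: a standalone sketch (assuming no POWER9 vec_xl_len, i.e. the #else branch above) of the load fallback: the n source elements are copied into an aligned, zero-initialized scratch buffer starting at element offset, and the whole buffer is then reloaded as one aligned packet. Buffer size and types are illustrative only. On POWER9 the hunk above achieves the same placement by shifting the vec_xl_len result by offset * size octets instead.

```cpp
// Standalone sketch: the pre-POWER9 partial-load fallback in plain C++.
#include <cstdio>
#include <cstring>

int main() {
  alignas(16) float load[4] = {0.f, 0.f, 0.f, 0.f};   // EIGEN_ALIGN16-style scratch, zero-filled
  const float from[2] = {5.f, 6.f};
  const int n = 2, offset = 1, size = sizeof(float);
  // Land the n*size source bytes at element "offset"; a full aligned load of "load"
  // then yields the same lane layout the generic ploadu_partial fallback builds.
  std::memcpy(reinterpret_cast<unsigned char*>(load + offset), from, n * size);
  std::printf("%g %g %g %g\n", load[0], load[1], load[2], load[3]);   // 0 5 6 0
  return 0;
}
```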
@@ -1436,57 +1446,67 @@ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* t
   pstoreu_common<Packet16uc>(to, from);
 }
 
-template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n)
+template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
 {
   const Index packet_size = unpacket_traits<Packet>::size;
-  eigen_internal_assert(n <= packet_size && "number of elements will write past end of packet");
+  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
 #ifdef _ARCH_PWR9
   EIGEN_UNUSED_VARIABLE(packet_size);
   EIGEN_DEBUG_UNALIGNED_STORE
-  vec_xst_len(from, to, n * size);
+  Packet store = from;
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    store = Packet(vec_slo(Packet16uc(store), shift));
+#else
+    store = Packet(vec_sro(Packet16uc(store), shift));
+#endif
+  }
+  vec_xst_len(store, to, n * size);
 #else
   if (n) {
-    Index n2 = n * size;
-    if (16 <= n2) {
-      pstoreu(to, from);
-    }
     EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
     pstore(store, from);
-    unsigned char* store2 = reinterpret_cast<unsigned char *>(store);
+    unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
     unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
-    memcpy((void *)to2, (void *)store2, n2);
+    Index n2 = n * size;
+    if (16 <= n2) {
+      pstoreu(to2, ploadu<Packet16uc>(store2));
+    } else {
+      memcpy((void *)to2, (void *)store2, n2);
+    }
   }
 #endif
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet4f>(to, from, n);
+  pstoreu_partial_common<Packet4f>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet4i>(to, from, n);
+  pstoreu_partial_common<Packet4i>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet8s>(to, from, n);
+  pstoreu_partial_common<Packet8s>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet8us>(to, from, n);
+  pstoreu_partial_common<Packet8us>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n);
+  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet16c>(to, from, n);
+  pstoreu_partial_common<Packet16c>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet16uc>(to, from, n);
+  pstoreu_partial_common<Packet16uc>(to, from, n, offset);
 }
 
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
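Note: the matching standalone sketch (not Eigen code) for the pre-POWER9 store fallback above: the packet is first spilled to an aligned scratch buffer, then the n elements starting at lane offset are copied out to the (possibly unaligned) destination. Names and the 4-lane width are illustrative.

```cpp
// Standalone sketch: the pre-POWER9 partial-store fallback in plain C++.
#include <cstdio>
#include <cstring>

int main() {
  alignas(16) float store[4] = {10.f, 11.f, 12.f, 13.f};  // stands in for pstore(store, from)
  float to[2] = {0.f, 0.f};
  const int n = 2, offset = 1, size = sizeof(float);
  // Copy lanes [offset, offset+n) of the packet image to the destination.
  std::memcpy(to, reinterpret_cast<const unsigned char*>(store + offset), n * size);
  std::printf("%g %g\n", to[0], to[1]);                   // 11 12
  return 0;
}
```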
@@ -2953,9 +2973,9 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
   return vec_xl(0, const_cast<double*>(from));
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n)
+template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset)
 {
-  return ploadu_partial_common<Packet2d>(from, n);
+  return ploadu_partial_common<Packet2d>(from, n, offset);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
@@ -2972,9 +2992,9 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
   vec_xst(from, 0, to);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n)
+template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
 {
-  pstoreu_partial_common<Packet2d>(to, from, n);
+  pstoreu_partial_common<Packet2d>(to, from, n, offset);
 }
 
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
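Note: a standalone sketch (not Eigen code) of the caller-side effect of forwarding offset through ploadt_partial / pstoret_partial: a sub-range [offset, offset+n) of a packet-sized block can now round-trip through a partial load followed by a partial store. The 2-lane double packet below mirrors Packet2d only in width; all names are illustrative.

```cpp
// Standalone sketch: copy a sub-range of a 2-lane "packet" via partial load + partial store.
#include <algorithm>
#include <cassert>
#include <cstdio>

constexpr int kLanes = 2;

void copy_subrange(double* to, const double* from, int n, int offset) {
  assert(n + offset <= kLanes);
  double lanes[kLanes] = {0.0, 0.0};
  // ploadu_partial-style: place from[0..n) at lanes [offset, offset+n); the rest stays zero.
  for (int i = offset; i < std::min(n + offset, kLanes); ++i) lanes[i] = from[i - offset];
  // pstoreu_partial-style: write lanes [offset, offset+n) back out as n contiguous values.
  for (int i = 0; i < std::min(n, kLanes - offset); ++i) to[i] = lanes[i + offset];
}

int main() {
  const double src[1] = {42.0};
  double dst[1] = {0.0};
  copy_subrange(dst, src, /*n=*/1, /*offset=*/1);   // round-trips through lane 1 of the model
  std::printf("%g\n", dst[0]);                      // 42
  return 0;
}
```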