diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 4190d1bd1..f3f6a1a1b 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -73,30 +73,13 @@ struct packet_traits > : default_packet_traits { }; template <> -struct unpacket_traits { - typedef std::complex type; - typedef Packet1cf half; - typedef Packet2f as_real; - enum { - size = 1, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default> { + using as_real = Packet2f; }; template <> -struct unpacket_traits { - typedef std::complex type; - typedef Packet1cf half; - typedef Packet4f as_real; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default> { + using half = Packet1cf; + using as_real = Packet4f; }; template <> @@ -297,10 +280,12 @@ EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packe template <> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); } template <> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } @@ -324,10 +309,12 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* fro template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet2cf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } @@ -538,21 +525,13 @@ struct packet_traits > : default_packet_traits { }; template <> -struct unpacket_traits { - typedef std::complex type; - typedef Packet1cd half; - typedef Packet2d as_real; - enum { - size = 1, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default> { + using as_real = Packet2d; }; template <> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } @@ -666,6 +645,7 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* fr template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cd& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9364cffca..135b7e4e4 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -437,224 +437,74 @@ struct packet_traits : default_packet_traits { }; }; +template +struct neon_unpacket_default { + using type = Scalar; + using half = Packet; + static constexpr int size = sizeof(Packet) / sizeof(Scalar); + static constexpr int alignment = sizeof(Packet); + static constexpr bool vectorizable = true; + static constexpr bool masked_load_available = false; + static constexpr bool masked_store_available = false; +}; + template <> -struct unpacket_traits { - typedef float type; - typedef Packet2f half; - typedef Packet2i integer_packet; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using integer_packet = Packet2i; }; template <> -struct unpacket_traits { - typedef float type; - typedef Packet2f half; - typedef Packet4i integer_packet; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet2f; + using integer_packet = Packet4i; }; template <> -struct unpacket_traits { - typedef int8_t type; - typedef Packet4c half; - enum { - size = 4, - alignment = Unaligned, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4c; }; template <> -struct unpacket_traits { - typedef int8_t type; - typedef Packet4c half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet8c; }; template <> -struct unpacket_traits { - typedef int8_t type; - typedef Packet8c half; - enum { - size = 16, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4uc; }; template <> -struct unpacket_traits { - typedef uint8_t type; - typedef Packet4uc half; - enum { - size = 4, - alignment = Unaligned, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet8uc; }; template <> -struct unpacket_traits { - typedef uint8_t type; - typedef Packet4uc half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4s; }; template <> -struct unpacket_traits { - typedef uint8_t type; - typedef Packet8uc half; - enum { - size = 16, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4us; }; template <> -struct unpacket_traits { - typedef int16_t type; - typedef Packet4s half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet2i; }; template <> -struct unpacket_traits { - typedef int16_t type; - typedef Packet4s half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet2ui; }; template <> -struct unpacket_traits { - typedef uint16_t type; - typedef Packet4us half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +struct unpacket_traits : neon_unpacket_default {}; template <> -struct unpacket_traits { - typedef uint16_t type; - typedef Packet4us half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef int32_t type; - typedef Packet2i half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef int32_t type; - typedef Packet2i half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef uint32_t type; - typedef Packet2ui half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef uint32_t type; - typedef Packet2ui half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef int64_t type; - typedef Packet2l half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef uint64_t type; - typedef Packet2ul half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +struct unpacket_traits : neon_unpacket_default {}; template <> EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) { @@ -2417,10 +2267,12 @@ EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { template <> EIGEN_STRONG_INLINE Packet2f pload(const float* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); } template <> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } template <> @@ -2431,10 +2283,12 @@ EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) { } template <> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); } template <> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); } template <> @@ -2445,50 +2299,62 @@ EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) { } template <> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); } template <> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); } template <> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); } template <> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); } template <> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); } template <> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); } template <> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); } template <> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } template <> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); } template <> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); } template <> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); } template <> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); } @@ -2713,10 +2579,12 @@ EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } template <> @@ -2725,10 +2593,12 @@ EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) { } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from); } template <> @@ -2737,50 +2607,62 @@ EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) { } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from); } @@ -4801,17 +4683,7 @@ struct packet_traits : default_packet_traits { }; template <> -struct unpacket_traits { - typedef bfloat16 type; - typedef Packet4bf half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +struct unpacket_traits : neon_unpacket_default {}; namespace detail { template <> @@ -4866,6 +4738,7 @@ EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { template <> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); return Packet4bf(pload(reinterpret_cast(from))); } @@ -4876,6 +4749,7 @@ EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { template <> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast(to), from); } @@ -5201,17 +5075,8 @@ struct packet_traits : default_packet_traits { }; template <> -struct unpacket_traits { - typedef double type; - typedef Packet2d half; - typedef Packet2l integer_packet; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using integer_packet = Packet2l; }; template <> @@ -5373,6 +5238,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { template <> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } @@ -5387,6 +5253,7 @@ EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { } template <> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } @@ -5579,29 +5446,10 @@ struct packet_traits : default_packet_traits { }; template <> -struct unpacket_traits { - typedef Eigen::half type; - typedef Packet4hf half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - +struct unpacket_traits : neon_unpacket_default {}; template <> -struct unpacket_traits { - typedef Eigen::half type; - typedef Packet4hf half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet4hf; }; template <> @@ -5934,11 +5782,13 @@ EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packe template <> EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); } @@ -6014,11 +5864,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8hf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); } template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4hf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast(to), from); } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 44056b334..d6c09a392 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -1339,6 +1339,21 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) { } #endif +/** \internal + * This informs the implementation that PTR is aligned to at least ALIGN_BYTES + */ +#ifndef EIGEN_ASSUME_ALIGNED +#if defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L) +#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \ + { PTR = std::assume_aligned<8 * (ALIGN_BYTES)>(PTR); } +#elif EIGEN_HAS_BUILTIN(__builtin_assume_aligned) +#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \ + { PTR = static_cast(__builtin_assume_aligned(PTR, (ALIGN_BYTES))); } +#else +#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) /* do nothing */ +#endif +#endif + } // end namespace internal } // end namespace Eigen