arm packet alignment requirements and aligned loads/stores

This commit is contained in:
Charles Schlosser 2025-07-15 23:49:04 +00:00
parent 430e35fbd1
commit 302fc46bc3
3 changed files with 120 additions and 273 deletions

View File

@ -73,30 +73,13 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
}; };
template <> template <>
struct unpacket_traits<Packet1cf> { struct unpacket_traits<Packet1cf> : neon_unpacket_default<Packet1cf, std::complex<float>> {
typedef std::complex<float> type; using as_real = Packet2f;
typedef Packet1cf half;
typedef Packet2f as_real;
enum {
size = 1,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet2cf> { struct unpacket_traits<Packet2cf> : neon_unpacket_default<Packet2cf, std::complex<float>> {
typedef std::complex<float> type; using half = Packet1cf;
typedef Packet1cf half; using as_real = Packet4f;
typedef Packet4f as_real;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
@ -297,10 +280,12 @@ EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packe
template <> template <>
EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) { EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet1cf>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from));
} }
template <> template <>
EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2cf>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from)));
} }
@ -324,10 +309,12 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* fro
template <> template <>
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) { EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet1cf>::alignment);
EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2cf>::alignment);
EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v);
} }
@ -538,21 +525,13 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
}; };
template <> template <>
struct unpacket_traits<Packet1cd> { struct unpacket_traits<Packet1cd> : neon_unpacket_default<Packet1cd, std::complex<double>> {
typedef std::complex<double> type; using as_real = Packet2d;
typedef Packet1cd half;
typedef Packet2d as_real;
enum {
size = 1,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet1cd>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from)));
} }
@ -666,6 +645,7 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* fr
template <> template <>
EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet1cd>::alignment);
EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v);
} }

View File

@ -437,224 +437,74 @@ struct packet_traits<uint64_t> : default_packet_traits {
}; };
}; };
template <typename Packet, typename Scalar>
struct neon_unpacket_default {
using type = Scalar;
using half = Packet;
static constexpr int size = sizeof(Packet) / sizeof(Scalar);
static constexpr int alignment = sizeof(Packet);
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
};
template <> template <>
struct unpacket_traits<Packet2f> { struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
typedef float type; using integer_packet = Packet2i;
typedef Packet2f half;
typedef Packet2i integer_packet;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet4f> { struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
typedef float type; using half = Packet2f;
typedef Packet2f half; using integer_packet = Packet4i;
typedef Packet4i integer_packet;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet4c> { struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
typedef int8_t type; template <>
typedef Packet4c half; struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
enum { using half = Packet4c;
size = 4,
alignment = Unaligned,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet8c> { struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
typedef int8_t type; using half = Packet8c;
typedef Packet4c half;
enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet16c> { struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
typedef int8_t type; template <>
typedef Packet8c half; struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
enum { using half = Packet4uc;
size = 16,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet4uc> { struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
typedef uint8_t type; using half = Packet8uc;
typedef Packet4uc half;
enum {
size = 4,
alignment = Unaligned,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet8uc> { struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
typedef uint8_t type; template <>
typedef Packet4uc half; struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
enum { using half = Packet4s;
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet16uc> { struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
typedef uint8_t type; template <>
typedef Packet8uc half; struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
enum { using half = Packet4us;
size = 16,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet4s> { struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
typedef int16_t type; template <>
typedef Packet4s half; struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
enum { using half = Packet2i;
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet8s> { struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
typedef int16_t type; template <>
typedef Packet4s half; struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
enum { using half = Packet2ui;
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
struct unpacket_traits<Packet4us> { struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
typedef uint16_t type;
typedef Packet4us half;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <> template <>
struct unpacket_traits<Packet8us> { struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
typedef uint16_t type;
typedef Packet4us half;
enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet2i> {
typedef int32_t type;
typedef Packet2i half;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet4i> {
typedef int32_t type;
typedef Packet2i half;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet2ui> {
typedef uint32_t type;
typedef Packet2ui half;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet4ui> {
typedef uint32_t type;
typedef Packet2ui half;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet2l> {
typedef int64_t type;
typedef Packet2l half;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet2ul> {
typedef uint64_t type;
typedef Packet2ul half;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <> template <>
EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) { EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
@ -2417,10 +2267,12 @@ EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
template <> template <>
EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) { EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2f>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4f>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
} }
template <> template <>
@ -2431,10 +2283,12 @@ EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) { EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8c>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) { EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet16c>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
} }
template <> template <>
@ -2445,50 +2299,62 @@ EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) { EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8uc>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) { EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet16uc>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) { EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4s>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) { EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8s>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) { EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4us>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) { EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8us>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) { EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2i>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4i>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) { EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2ui>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) { EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4ui>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) { EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2l>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) { EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2ul>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
} }
@ -2713,10 +2579,12 @@ EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
template <> template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) { EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2f>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4f>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
} }
template <> template <>
@ -2725,10 +2593,12 @@ EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) { EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8c>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) { EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet16c>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
} }
template <> template <>
@ -2737,50 +2607,62 @@ EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) { EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8uc>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) { EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet16uc>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) { EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4s>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) { EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8s>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) { EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4us>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) { EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8us>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) { EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2i>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4i>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) { EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2ui>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4ui>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) { EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2l>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) { EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2ul>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
} }
@ -4801,17 +4683,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
}; };
template <> template <>
struct unpacket_traits<Packet4bf> { struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
typedef bfloat16 type;
typedef Packet4bf half;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
namespace detail { namespace detail {
template <> template <>
@ -4866,6 +4738,7 @@ EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
template <> template <>
EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) { EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4bf>::alignment);
return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from))); return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
} }
@ -4876,6 +4749,7 @@ EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
template <> template <>
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) { EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4bf>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from); EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
} }
@ -5201,17 +5075,8 @@ struct packet_traits<double> : default_packet_traits {
}; };
template <> template <>
struct unpacket_traits<Packet2d> { struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
typedef double type; using integer_packet = Packet2l;
typedef Packet2d half;
typedef Packet2l integer_packet;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
@ -5373,6 +5238,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
template <> template <>
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2d>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
} }
@ -5387,6 +5253,7 @@ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2d>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
} }
@ -5579,29 +5446,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
}; };
template <> template <>
struct unpacket_traits<Packet4hf> { struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
typedef Eigen::half type;
typedef Packet4hf half;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <> template <>
struct unpacket_traits<Packet8hf> { struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
typedef Eigen::half type; using half = Packet4hf;
typedef Packet4hf half;
enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
}; };
template <> template <>
@ -5934,11 +5782,13 @@ EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packe
template <> template <>
EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) { EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8hf>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from)); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) { EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4hf>::alignment);
EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from)); EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
} }
@ -6014,11 +5864,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a,
template <> template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) { EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8hf>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from); EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
} }
template <> template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) { EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4hf>::alignment);
EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from); EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
} }

View File

@ -1339,6 +1339,21 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) {
} }
#endif #endif
/** \internal
* This informs the implementation that PTR is aligned to at least ALIGN_BYTES
*/
#ifndef EIGEN_ASSUME_ALIGNED
#if defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L)
#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \
{ PTR = std::assume_aligned<8 * (ALIGN_BYTES)>(PTR); }
#elif EIGEN_HAS_BUILTIN(__builtin_assume_aligned)
#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \
{ PTR = static_cast<decltype(PTR)>(__builtin_assume_aligned(PTR, (ALIGN_BYTES))); }
#else
#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) /* do nothing */
#endif
#endif
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen