From 430e35fbd15d3c946d2d2ba19ec41c16ba217cb3 Mon Sep 17 00:00:00 2001 From: Sean McBride Date: Fri, 11 Jul 2025 11:30:23 -0400 Subject: [PATCH 01/70] Fixed -Wshadow warning by renaming variables --- Eigen/src/Core/GenericPacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 4287fa249..e1d62fa17 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -638,7 +638,7 @@ struct pminmax_impl { } }; -#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& a, const Type& b) { return Func(a, b); } +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& aa, const Type& bb) { return Func(aa, bb); } /** \internal \returns the min of \a a and \a b (coeff-wise). If \a a or \b b is NaN, the return value is implementation defined. */ From 302fc46bc3258bbdf8594475b0eb79c8c88bb143 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Tue, 15 Jul 2025 23:49:04 +0000 Subject: [PATCH 02/70] arm packet alignment requirements and aligned loads/stores --- Eigen/src/Core/arch/NEON/Complex.h | 46 +--- Eigen/src/Core/arch/NEON/PacketMath.h | 332 +++++++------------------- Eigen/src/Core/util/Memory.h | 15 ++ 3 files changed, 120 insertions(+), 273 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 4190d1bd1..f3f6a1a1b 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -73,30 +73,13 @@ struct packet_traits > : default_packet_traits { }; template <> -struct unpacket_traits { - typedef std::complex type; - typedef Packet1cf half; - typedef Packet2f as_real; - enum { - size = 1, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default> { + using as_real = Packet2f; }; template <> -struct unpacket_traits { - typedef std::complex type; 
- typedef Packet1cf half; - typedef Packet4f as_real; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default> { + using half = Packet1cf; + using as_real = Packet4f; }; template <> @@ -297,10 +280,12 @@ EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packe template <> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); } template <> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } @@ -324,10 +309,12 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* fro template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet2cf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } @@ -538,21 +525,13 @@ struct packet_traits > : default_packet_traits { }; template <> -struct unpacket_traits { - typedef std::complex type; - typedef Packet1cd half; - typedef Packet2d as_real; - enum { - size = 1, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default> { + using as_real = Packet2d; }; template <> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } @@ -666,6 +645,7 @@ EIGEN_STRONG_INLINE Packet1cd 
ploaddup(const std::complex* fr template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cd& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9364cffca..135b7e4e4 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -437,224 +437,74 @@ struct packet_traits : default_packet_traits { }; }; +template +struct neon_unpacket_default { + using type = Scalar; + using half = Packet; + static constexpr int size = sizeof(Packet) / sizeof(Scalar); + static constexpr int alignment = sizeof(Packet); + static constexpr bool vectorizable = true; + static constexpr bool masked_load_available = false; + static constexpr bool masked_store_available = false; +}; + template <> -struct unpacket_traits { - typedef float type; - typedef Packet2f half; - typedef Packet2i integer_packet; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using integer_packet = Packet2i; }; template <> -struct unpacket_traits { - typedef float type; - typedef Packet2f half; - typedef Packet4i integer_packet; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet2f; + using integer_packet = Packet4i; }; template <> -struct unpacket_traits { - typedef int8_t type; - typedef Packet4c half; - enum { - size = 4, - alignment = Unaligned, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4c; }; template <> 
-struct unpacket_traits { - typedef int8_t type; - typedef Packet4c half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet8c; }; template <> -struct unpacket_traits { - typedef int8_t type; - typedef Packet8c half; - enum { - size = 16, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4uc; }; template <> -struct unpacket_traits { - typedef uint8_t type; - typedef Packet4uc half; - enum { - size = 4, - alignment = Unaligned, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet8uc; }; template <> -struct unpacket_traits { - typedef uint8_t type; - typedef Packet4uc half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4s; }; template <> -struct unpacket_traits { - typedef uint8_t type; - typedef Packet8uc half; - enum { - size = 16, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet4us; }; template <> -struct unpacket_traits { - typedef int16_t type; - typedef Packet4s half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; 
+template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet2i; }; template <> -struct unpacket_traits { - typedef int16_t type; - typedef Packet4s half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default {}; +template <> +struct unpacket_traits : neon_unpacket_default { + using half = Packet2ui; }; template <> -struct unpacket_traits { - typedef uint16_t type; - typedef Packet4us half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +struct unpacket_traits : neon_unpacket_default {}; template <> -struct unpacket_traits { - typedef uint16_t type; - typedef Packet4us half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef int32_t type; - typedef Packet2i half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef int32_t type; - typedef Packet2i half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef uint32_t type; - typedef Packet2ui half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef uint32_t type; - typedef Packet2ui half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef int64_t type; - typedef Packet2l half; 
- enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template <> -struct unpacket_traits { - typedef uint64_t type; - typedef Packet2ul half; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +struct unpacket_traits : neon_unpacket_default {}; template <> EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) { @@ -2417,10 +2267,12 @@ EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { template <> EIGEN_STRONG_INLINE Packet2f pload(const float* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); } template <> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } template <> @@ -2431,10 +2283,12 @@ EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) { } template <> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); } template <> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); } template <> @@ -2445,50 +2299,62 @@ EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) { } template <> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); } template <> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); } template <> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); 
EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); } template <> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); } template <> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); } template <> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); } template <> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); } template <> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } template <> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); } template <> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); } template <> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); } template <> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); } @@ -2713,10 +2579,12 @@ EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(float* to, const 
Packet4f& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } template <> @@ -2725,10 +2593,12 @@ EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) { } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from); } template <> @@ -2737,50 +2607,62 @@ EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) { } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, 
from); } template <> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from); } template <> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from); } template <> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from); } @@ -4801,17 +4683,7 @@ struct packet_traits : default_packet_traits { }; template <> -struct unpacket_traits { - typedef bfloat16 type; - typedef Packet4bf half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +struct unpacket_traits : neon_unpacket_default {}; namespace detail { template <> @@ -4866,6 +4738,7 @@ EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { template <> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); return Packet4bf(pload(reinterpret_cast(from))); } @@ -4876,6 +4749,7 @@ EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { template <> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast(to), from); } @@ -5201,17 +5075,8 @@ struct packet_traits : default_packet_traits { }; template <> -struct unpacket_traits { - 
typedef double type; - typedef Packet2d half; - typedef Packet2l integer_packet; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using integer_packet = Packet2l; }; template <> @@ -5373,6 +5238,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { template <> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } @@ -5387,6 +5253,7 @@ EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { } template <> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } @@ -5579,29 +5446,10 @@ struct packet_traits : default_packet_traits { }; template <> -struct unpacket_traits { - typedef Eigen::half type; - typedef Packet4hf half; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - +struct unpacket_traits : neon_unpacket_default {}; template <> -struct unpacket_traits { - typedef Eigen::half type; - typedef Packet4hf half; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; +struct unpacket_traits : neon_unpacket_default { + using half = Packet4hf; }; template <> @@ -5934,11 +5782,13 @@ EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packe template <> EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { + EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); 
EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); } @@ -6014,11 +5864,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8hf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); } template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4hf& from) { + EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast(to), from); } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 44056b334..d6c09a392 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -1339,6 +1339,21 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) { } #endif +/** \internal + * Informs the implementation that PTR is aligned to at least ALIGN_BYTES bytes; both back ends take a byte count. + */ +#ifndef EIGEN_ASSUME_ALIGNED +#if defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L) +#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \ + { PTR = std::assume_aligned<(ALIGN_BYTES)>(PTR); } +#elif EIGEN_HAS_BUILTIN(__builtin_assume_aligned) +#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \ + { PTR = static_cast<decltype(PTR)>(__builtin_assume_aligned(PTR, (ALIGN_BYTES))); } +#else +#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) /* do nothing */ +#endif +#endif + } // end namespace internal } // end namespace Eigen From cedf1f4c17aad15d6892d0ed315420076f178257 Mon Sep 17 00:00:00 2001 From: Kuan-Ting Date: Mon, 14 Jul 2025 14:18:00 +0800 Subject: [PATCH 03/70] Fix typo: duplicated 'for' in docs --- Eigen/src/Core/MathFunctionsImpl.h | 6 +++--- doc/QuickReference.dox | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index cf8dcc3b8..c4b5da3cc 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -28,7 +28,7
@@ namespace internal { 2. If a is zero, approx_a_recip must be infinite with the same sign as a. 3. If a is infinite, approx_a_recip must be zero with the same sign as a. - If the preconditions are satisfied, which they are for for the _*_rcp_ps + If the preconditions are satisfied, which they are for the _*_rcp_ps instructions on x86, the result has a maximum relative error of 2 ulps, and correctly handles reciprocals of zero, infinity, and NaN. */ @@ -66,7 +66,7 @@ struct generic_reciprocal_newton_step { 2. If a is zero, approx_a_recip must be infinite with the same sign as a. 3. If a is infinite, approx_a_recip must be zero with the same sign as a. - If the preconditions are satisfied, which they are for for the _*_rcp_ps + If the preconditions are satisfied, which they are for the _*_rcp_ps instructions on x86, the result has a maximum relative error of 2 ulps, and correctly handles zero, infinity, and NaN. Positive denormals are treated as zero. @@ -116,7 +116,7 @@ struct generic_rsqrt_newton_step { 2. If a is zero, approx_rsqrt must be infinite. 3. If a is infinite, approx_rsqrt must be zero. - If the preconditions are satisfied, which they are for for the _*_rsqrt_ps + If the preconditions are satisfied, which they are for the _*_rsqrt_ps instructions on x86, the result has a maximum relative error of 2 ulps, and correctly handles zero and infinity, and NaN. Positive denormal inputs are treated as zero. diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index c61d47afa..1fb000fdc 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -449,7 +449,7 @@ conj(array1) -Some coefficient-wise operators are readily available for for matrices and vectors through the following cwise* methods: +Some coefficient-wise operators are readily available for matrices and vectors through the following cwise* methods:
Matrix API \matrixworldVia Array conversions
\code From d7fa5ebe0e5ce1fc8d8b61401148832f4d61c49c Mon Sep 17 00:00:00 2001 From: jacques FRANC Date: Thu, 17 Jul 2025 15:27:26 +0000 Subject: [PATCH 04/70] Fix API incompatibility for ILU in superLU support --- Eigen/src/SuperLUSupport/SuperLUSupport.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index 0c101494a..b5d29b219 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -65,6 +65,24 @@ DECL_GSSVX(z, double, std::complex) #ifdef EIGEN_SUPERLU_HAS_ILU // similarly for the incomplete factorization using gsisx +#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5) +#define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE) \ + extern "C" { \ + extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \ + SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, \ + FLOATTYPE *, GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *); \ + } \ + inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree, \ + char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work, \ + int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth, \ + FLOATTYPE *rcond, SuperLUStat_t *stats, int *info, KEYTYPE) { \ + mem_usage_t mem_usage; \ + GlobalLU_t gLU; \ + PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond, \ + &gLU, &mem_usage, stats, info); \ + return mem_usage.for_lu; /* bytes used by the factor storage */ \ + } +#else // version < 5.0 #define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE) \ extern "C" { \ extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \ @@ -80,6 +98,7 @@ DECL_GSSVX(z, double, std::complex) &mem_usage, 
stats, info); \ return mem_usage.for_lu; /* bytes used by the factor storage */ \ } +#endif DECL_GSISX(s, float, float) DECL_GSISX(c, float, std::complex) From 2cf66d4b0d0ba52cbf2507e15998c4735aa14406 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 17 Jul 2025 21:20:39 +0000 Subject: [PATCH 05/70] Use numext::fma in more places in SparseCore. --- Eigen/src/SparseCore/SparseDot.h | 2 +- Eigen/src/SparseCore/TriangularSolver.h | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h index 76a4f6cb7..8aeebc8f4 100644 --- a/Eigen/src/SparseCore/SparseDot.h +++ b/Eigen/src/SparseCore/SparseDot.h @@ -67,7 +67,7 @@ inline typename internal::traits::Scalar SparseMatrixBase::dot Scalar res(0); while (i && j) { if (i.index() == j.index()) { - res += numext::conj(i.value()) * j.value(); + res = numext::fma(numext::conj(i.value()), j.value(), res); ++i; ++j; } else if (i.index() < j.index()) diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h index 7753a246a..10e27d70f 100644 --- a/Eigen/src/SparseCore/TriangularSolver.h +++ b/Eigen/src/SparseCore/TriangularSolver.h @@ -41,7 +41,7 @@ struct sparse_solve_triangular_selector { lastVal = it.value(); lastIndex = it.index(); if (lastIndex == i) break; - tmp -= lastVal * other.coeff(lastIndex, col); + tmp = numext::fma(-lastVal, other.coeff(lastIndex, col), tmp); } if (Mode & UnitDiag) other.coeffRef(i, col) = tmp; @@ -75,7 +75,7 @@ struct sparse_solve_triangular_selector { } else if (it && it.index() == i) ++it; for (; it; ++it) { - tmp -= it.value() * other.coeff(it.index(), col); + tmp = numext::fma(-it.value(), other.coeff(it.index(), col), tmp); } if (Mode & UnitDiag) @@ -107,7 +107,9 @@ struct sparse_solve_triangular_selector { tmp /= it.value(); } if (it && it.index() == i) ++it; - for (; it; ++it) other.coeffRef(it.index(), col) -= tmp * it.value(); + for (; it; ++it) { + 
other.coeffRef(it.index(), col) = numext::fma(-tmp, it.value(), other.coeffRef(it.index(), col)); + } } } } @@ -135,7 +137,9 @@ struct sparse_solve_triangular_selector { other.coeffRef(i, col) /= it.value(); } LhsIterator it(lhsEval, i); - for (; it && it.index() < i; ++it) other.coeffRef(it.index(), col) -= tmp * it.value(); + for (; it && it.index() < i; ++it) { + other.coeffRef(it.index(), col) = numext::fma(-tmp, it.value(), other.coeffRef(it.index(), col)); + } } } } @@ -215,9 +219,13 @@ struct sparse_solve_triangular_sparse_selector { tempVector.restart(); if (IsLower) { if (it.index() == i) ++it; - for (; it; ++it) tempVector.coeffRef(it.index()) -= ci * it.value(); + for (; it; ++it) { + tempVector.coeffRef(it.index()) = numext::fma(-ci, it.value(), tempVector.coeffRef(it.index())); + } } else { - for (; it && it.index() < i; ++it) tempVector.coeffRef(it.index()) -= ci * it.value(); + for (; it && it.index() < i; ++it) { + tempVector.coeffRef(it.index()) = numext::fma(-ci, it.value(), tempVector.coeffRef(it.index())); + } } } } From efe5b6979d6430f4db260672a5432f34ec2e4d91 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 18 Jul 2025 18:06:28 +0000 Subject: [PATCH 06/70] Unconditionally include . Some c++20 builds are currently broken because it is needed for std::assume_aligned. --- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index cf2b164b7..46e6f7294 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -92,6 +92,7 @@ #include #include +#include #include // for std::is_nothrow_move_assignable @@ -121,7 +122,6 @@ #undef isfinite #include #include -#include #include #include #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0 From 97c7cc62006ee303609b4b56a9deb49e6a55f528 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 18 Jul 2025 21:51:42 +0000 Subject: [PATCH 07/70] Explicitly use the packet trait HasPow to control whether Pow is vectorized. 
--- Eigen/src/Core/arch/AVX/PacketMath.h | 2 ++ Eigen/src/Core/arch/AVX512/PacketMath.h | 2 ++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 3 +++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 ++ Eigen/src/Core/arch/SSE/PacketMath.h | 2 ++ Eigen/src/Core/arch/SVE/PacketMath.h | 2 ++ Eigen/src/Core/arch/ZVector/PacketMath.h | 1 + Eigen/src/Core/functors/BinaryFunctors.h | 6 +----- 8 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index eb5da53d0..0cd9e6cd6 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -118,6 +118,7 @@ struct packet_traits : default_packet_traits { HasLog1p = 1, HasExpm1 = 1, HasExp = 1, + HasPow = 1, HasNdtri = 1, HasBessel = 1, HasSqrt = 1, @@ -149,6 +150,7 @@ struct packet_traits : default_packet_traits { HasErf = 1, HasErfc = 1, HasExp = 1, + HasPow = 1, HasSqrt = 1, HasRsqrt = 1, HasCbrt = 1, diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 932b0568d..b76c8a77f 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -135,6 +135,7 @@ struct packet_traits : default_packet_traits { HasNdtri = 1, HasBessel = 1, HasExp = 1, + HasPow = 1, HasReciprocal = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, @@ -159,6 +160,7 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, + HasPow = 1, HasATan = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d7bd9bee4..eefe326b0 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -185,6 +185,8 @@ struct packet_traits : default_packet_traits { HasLog = 1, HasExp = 1, #ifdef EIGEN_VECTORIZE_VSX + HasCmp = 1, + HasPow = 1, HasSqrt = 1, HasCbrt = 1, #if 
!EIGEN_COMP_CLANG @@ -3175,6 +3177,7 @@ struct packet_traits : default_packet_traits { HasATanh = 1, HasATan = 0, HasLog = 0, + HasCmp = 1, HasExp = 1, HasSqrt = 1, HasCbrt = 1, diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 135b7e4e4..bea50a3ef 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -205,6 +205,7 @@ struct packet_traits : default_packet_traits { HasATanh = 1, HasLog = 1, HasExp = 1, + HasPow = 1, HasSqrt = 1, HasRsqrt = 1, HasCbrt = 1, @@ -5060,6 +5061,7 @@ struct packet_traits : default_packet_traits { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG HasExp = 1, HasLog = 1, + HasPow = 1, HasATan = 1, HasATanh = 1, #endif diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index e8902cff6..64ba7ba3a 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -192,6 +192,7 @@ struct packet_traits : default_packet_traits { HasExpm1 = 1, HasNdtri = 1, HasExp = 1, + HasPow = 1, HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, @@ -221,6 +222,7 @@ struct packet_traits : default_packet_traits { HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, HasExp = 1, + HasPow = 1, HasSqrt = 1, HasRsqrt = 1, HasCbrt = 1, diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h index 952d7561b..6115d1d35 100644 --- a/Eigen/src/Core/arch/SVE/PacketMath.h +++ b/Eigen/src/Core/arch/SVE/PacketMath.h @@ -354,10 +354,12 @@ struct packet_traits : default_packet_traits { HasDiv = 1, + HasCmp = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, + HasPow = 1, HasSqrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index b45681320..4d18af0c0 100644 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -180,6 
+180,7 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 4, + HasCmp = 1, HasAdd = 1, HasSub = 1, HasMul = 1, diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index a93b998b9..b6ecfb5d5 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -362,11 +362,7 @@ template struct functor_traits> { enum { Cost = 5 * NumTraits::MulCost, - PacketAccess = (!NumTraits::IsComplex && !NumTraits::IsInteger && packet_traits::HasExp && - packet_traits::HasLog && packet_traits::HasRound && packet_traits::HasCmp && - // Temporarily disable packet access for half/bfloat16 until - // accuracy is improved. - !is_same::value && !is_same::value) + PacketAccess = (!NumTraits::IsComplex && !NumTraits::IsInteger && packet_traits::HasPow) }; }; From b5bef9dcb09d3fa2eeb913419902fbad03fe8e4b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 18 Jul 2025 17:58:48 -0700 Subject: [PATCH 08/70] Fix bug in Erfc introduced in !1862. 
--- unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 1c3ce8738..228a7a5d4 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -323,7 +323,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x const T num = ppolevl::run(q2, gamma); const T denom = pmul(x, ppolevl::run(q2, delta)); const T r = pdiv(num, denom); - const T maybe_two = pand(pcmp_lt(x, pset1(0.0)), pset1(2.0)); + const T maybe_two = pselect(pcmp_lt(x, pset1(0.0)), pset1(2.0), pset1(0.0)); const T erfc_large = pmadd(z, r, maybe_two); return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); } @@ -397,7 +397,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erfc_double_large(const T& x, const T& x const T num_large = ppolevl::run(q2, gamma); const T denom_large = pmul(x, ppolevl::run(q2, delta)); const T r = pdiv(num_large, denom_large); - const T maybe_two = pand(pcmp_lt(x, pset1(0.0)), pset1(2.0)); + const T maybe_two = pselect(pcmp_lt(x, pset1(0.0)), pset1(2.0), pset1(0.0)); return pmadd(z, r, maybe_two); } From abeba8535664525fa8ce5764eb2309ffc7bca21b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 19 Jul 2025 01:17:12 +0000 Subject: [PATCH 09/70] Use proper float literals in SpecialFunctionsImpl.h. 
--- unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 228a7a5d4..387836b72 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -323,7 +323,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x const T num = ppolevl::run(q2, gamma); const T denom = pmul(x, ppolevl::run(q2, delta)); const T r = pdiv(num, denom); - const T maybe_two = pselect(pcmp_lt(x, pset1(0.0)), pset1(2.0), pset1(0.0)); + const T maybe_two = pselect(pcmp_lt(x, pset1(0.0f)), pset1(2.0f), pset1(0.0f)); const T erfc_large = pmadd(z, r, maybe_two); return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); } From 1e65707aa20603fc2ee9c2ac21c466ef57d23e10 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Wed, 23 Jul 2025 22:26:40 +0000 Subject: [PATCH 10/70] Suppress Warray-bounds warning in generic ploaduSegment, fix edge case for vectorized cast --- Eigen/src/Core/CoreEvaluators.h | 6 +++--- Eigen/src/Core/GenericPacketMath.h | 11 +++++++---- test/packet_segment.cpp | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 63f1895d2..ec731acd5 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -707,7 +707,7 @@ struct unary_evaluator, ArgType>, In Index packetOffset = offset * PacketSize; Index actualRow = IsRowMajor ? row : row + packetOffset; Index actualCol = IsRowMajor ? 
col + packetOffset : col; - eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds"); + eigen_assert(check_array_bounds(actualRow, actualCol, begin, count) && "Array index out of bounds"); return m_argImpl.template packetSegment(actualRow, actualCol, begin, count); } template @@ -715,8 +715,8 @@ struct unary_evaluator, ArgType>, In Index offset) const { constexpr int PacketSize = unpacket_traits::size; Index packetOffset = offset * PacketSize; - Index actualIndex = index + packetOffset + begin; - eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds"); + Index actualIndex = index + packetOffset; + eigen_assert(check_array_bounds(actualIndex, begin, count) && "Array index out of bounds"); return m_argImpl.template packetSegment(actualIndex, begin, count); } diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e1d62fa17..de599a15c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -1596,9 +1596,10 @@ EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits::type; constexpr Index PacketSize = unpacket_traits::size; eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range"); - Scalar aux[PacketSize]; - memset(static_cast(aux), 0x00, sizeof(Scalar) * PacketSize); - smart_copy(from + begin, from + begin + count, aux + begin); + Scalar aux[PacketSize] = {}; + for (Index k = begin; k < begin + count; k++) { + aux[k] = from[k]; + } return ploadu(aux); } @@ -1619,7 +1620,9 @@ EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Ind eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range"); Scalar aux[PacketSize]; pstoreu(aux, from); - smart_copy(aux + begin, aux + begin + count, to + begin); + for (Index k = begin; k < begin + count; k++) { + to[k] = aux[k]; + } } /** \internal copy the packet \a from in the range [begin, 
begin + count) to \a *to. diff --git a/test/packet_segment.cpp b/test/packet_segment.cpp index 6fa6a290d..8a5469cb4 100644 --- a/test/packet_segment.cpp +++ b/test/packet_segment.cpp @@ -142,6 +142,21 @@ struct packet_segment_test_driver { static void run() {} }; +template ::Vectorizable> +void testReverseEdgeCase() { + // this reversed cast uses a non-zero offset for ploadSegment + Index size = 16 * internal::packet_traits::size + 1; + VectorX v1(size); + VectorX v2(size), v3(size); + v1.setRandom(); + v2 = v1.reverse().cast(); + v3 = v1.cast().reverse(); + VERIFY_IS_EQUAL(v2, v3); +} + +template <> +void testReverseEdgeCase() {} + template void test_packet_segment() { packet_segment_test_driver::size>::run(); @@ -164,5 +179,6 @@ EIGEN_DECLARE_TEST(packet_segment) { test_packet_segment(); test_packet_segment>(); test_packet_segment>(); + testReverseEdgeCase(); } } From f5ead2d34c19653b92ff6a5660f83c04f09a973a Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Tue, 29 Jul 2025 01:00:37 +0000 Subject: [PATCH 11/70] Fix intel packet math header inclusion order --- Eigen/Core | 53 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 46e6f7294..cc003b075 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -192,36 +192,51 @@ using std::ptrdiff_t; #include "src/Core/arch/Default/BFloat16.h" #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" -#if defined EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/Reductions.h" -#include "src/Core/arch/SSE/Complex.h" -#include "src/Core/arch/SSE/TypeCasting.h" -#include "src/Core/arch/SSE/MathFunctions.h" -#endif - -#if defined EIGEN_VECTORIZE_AVX #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/Reductions.h" -#include "src/Core/arch/AVX/Complex.h" -#include "src/Core/arch/AVX/TypeCasting.h" -#include 
"src/Core/arch/AVX/MathFunctions.h" -#endif - -#if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/AVX512/Reductions.h" -#include "src/Core/arch/AVX512/Complex.h" -#include "src/Core/arch/AVX512/TypeCasting.h" -#include "src/Core/arch/AVX512/MathFunctions.h" -#include "src/Core/arch/AVX512/TrsmKernel.h" -#endif - #if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/PacketMathFP16.h" +#endif +#include "src/Core/arch/SSE/TypeCasting.h" +#include "src/Core/arch/AVX/TypeCasting.h" +#include "src/Core/arch/AVX512/TypeCasting.h" +#if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/TypeCastingFP16.h" +#endif +#include "src/Core/arch/SSE/Complex.h" +#include "src/Core/arch/AVX/Complex.h" +#include "src/Core/arch/AVX512/Complex.h" +#include "src/Core/arch/SSE/MathFunctions.h" +#include "src/Core/arch/AVX/MathFunctions.h" +#include "src/Core/arch/AVX512/MathFunctions.h" +#if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/MathFunctionsFP16.h" #endif +#include "src/Core/arch/AVX512/TrsmKernel.h" +#elif defined EIGEN_VECTORIZE_AVX +// Use AVX for floats and doubles, SSE for integers +#include "src/Core/arch/SSE/PacketMath.h" +#include "src/Core/arch/SSE/Reductions.h" +#include "src/Core/arch/SSE/TypeCasting.h" +#include "src/Core/arch/SSE/Complex.h" +#include "src/Core/arch/AVX/PacketMath.h" +#include "src/Core/arch/AVX/Reductions.h" +#include "src/Core/arch/AVX/TypeCasting.h" +#include "src/Core/arch/AVX/Complex.h" +#include "src/Core/arch/SSE/MathFunctions.h" +#include "src/Core/arch/AVX/MathFunctions.h" +#elif defined EIGEN_VECTORIZE_SSE +#include "src/Core/arch/SSE/PacketMath.h" +#include "src/Core/arch/SSE/Reductions.h" +#include "src/Core/arch/SSE/TypeCasting.h" +#include "src/Core/arch/SSE/MathFunctions.h" +#include "src/Core/arch/SSE/Complex.h" +#endif #if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" 
From e4493233e800d2756783f0d5aaabb87227193922 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 31 Jul 2025 17:02:43 +0000 Subject: [PATCH 12/70] Fix EIGEN_OPTIMIZATION_BARRIER for clang-cl --- Eigen/src/Core/util/Macros.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 00d55577d..919855597 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -993,8 +993,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons #endif #if !defined(EIGEN_OPTIMIZATION_BARRIER) -#if EIGEN_COMP_GNUC - // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: +// Implement the barrier on GNUC compilers or clang-cl. +#if EIGEN_COMP_GNUC || (defined(__clang__) && defined(_MSC_VER)) +// According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: // X: Any operand whatsoever. // r: A register operand is allowed provided that it is in a general // register. @@ -1027,37 +1028,37 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons // directly for std::complex, Eigen::half, Eigen::bfloat16. For these, // you will need to apply to the underlying POD type. #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT - // This seems to be broken on clang. Packet4f is loaded into a single +// This seems to be broken on clang. Packet4f is loaded into a single // register rather than a vector, zeroing out some entries. Integer // types also generate a compile error. #if EIGEN_OS_MAC - // General, Altivec for Apple (VSX were added in ISA v2.06): +// General, Altivec for Apple (VSX were added in ISA v2.06): #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v"(X)); #else - // General, Altivec, VSX otherwise: +// General, Altivec, VSX otherwise: #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v,wa"(X)); #endif #elif EIGEN_ARCH_ARM_OR_ARM64 #ifdef __ARM_FP - // General, VFP or NEON. 
+// General, VFP or NEON. // Clang doesn't like "r", // error: non-trivial scalar-to-vector conversion, possible invalid // constraint for vector typ #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,w"(X)); #else - // Arm without VFP or NEON. +// Arm without VFP or NEON. // "w" constraint will not compile. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g"(X)); #endif #elif EIGEN_ARCH_i386_OR_x86_64 - // General, SSE. +// General, SSE. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,x"(X)); #else - // Not implemented for other architectures. +// Not implemented for other architectures. #define EIGEN_OPTIMIZATION_BARRIER(X) #endif #else - // Not implemented for other compilers. +// Not implemented for other compilers. #define EIGEN_OPTIMIZATION_BARRIER(X) #endif #endif From edcf4c135f75e1de9fddc536adaa7b8578addc47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 4 Aug 2025 19:11:43 +0000 Subject: [PATCH 13/70] Remove fortran dependency for eigenblas. 
--- blas/CMakeLists.txt | 19 +--------------- blas/fortran/complexdots.f | 43 ------------------------------------- blas/testing/CMakeLists.txt | 12 +++++++++++ 3 files changed, 13 insertions(+), 61 deletions(-) delete mode 100644 blas/fortran/complexdots.f diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 4ae060302..4f189925a 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -2,14 +2,6 @@ project(EigenBlas CXX) if(EIGEN_BUILD_BLAS) -include(CheckLanguage) -check_language(Fortran) -if(CMAKE_Fortran_COMPILER) - enable_language(Fortran) - set(EIGEN_Fortran_COMPILER_WORKS ON) -else() - set(EIGEN_Fortran_COMPILER_WORKS OFF) -endif() add_custom_target(blas) @@ -18,15 +10,9 @@ set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c - f2c/ztbmv.c + f2c/ztbmv.c f2c/complexdots.c ) -if (EIGEN_Fortran_COMPILER_WORKS) - set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f) -else() - set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) -endif() - set(EIGEN_BLAS_TARGETS "") add_library(eigen_blas_static ${EigenBlas_SRCS}) @@ -49,8 +35,6 @@ foreach(target IN LISTS EIGEN_BLAS_TARGETS) ARCHIVE DESTINATION lib) endforeach() -if(EIGEN_Fortran_COMPILER_WORKS) - if(EIGEN_BUILD_TESTING) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(testing) # can't do EXCLUDE_FROM_ALL here, breaks CTest @@ -60,4 +44,3 @@ if(EIGEN_BUILD_TESTING) endif() endif() -endif() diff --git a/blas/fortran/complexdots.f b/blas/fortran/complexdots.f deleted file mode 100644 index a7da51d16..000000000 --- a/blas/fortran/complexdots.f +++ /dev/null @@ -1,43 +0,0 @@ - COMPLEX FUNCTION CDOTC(N,CX,INCX,CY,INCY) - INTEGER INCX,INCY,N - COMPLEX CX(*),CY(*) - COMPLEX RES - EXTERNAL CDOTCW - - CALL CDOTCW(N,CX,INCX,CY,INCY,RES) - CDOTC = RES - RETURN - END - - COMPLEX FUNCTION CDOTU(N,CX,INCX,CY,INCY) - INTEGER INCX,INCY,N 
- COMPLEX CX(*),CY(*) - COMPLEX RES - EXTERNAL CDOTUW - - CALL CDOTUW(N,CX,INCX,CY,INCY,RES) - CDOTU = RES - RETURN - END - - DOUBLE COMPLEX FUNCTION ZDOTC(N,CX,INCX,CY,INCY) - INTEGER INCX,INCY,N - DOUBLE COMPLEX CX(*),CY(*) - DOUBLE COMPLEX RES - EXTERNAL ZDOTCW - - CALL ZDOTCW(N,CX,INCX,CY,INCY,RES) - ZDOTC = RES - RETURN - END - - DOUBLE COMPLEX FUNCTION ZDOTU(N,CX,INCX,CY,INCY) - INTEGER INCX,INCY,N - DOUBLE COMPLEX CX(*),CY(*) - DOUBLE COMPLEX RES - EXTERNAL ZDOTUW - - CALL ZDOTUW(N,CX,INCX,CY,INCY,RES) - ZDOTU = RES - RETURN - END diff --git a/blas/testing/CMakeLists.txt b/blas/testing/CMakeLists.txt index 52c23acda..f0ee6a487 100644 --- a/blas/testing/CMakeLists.txt +++ b/blas/testing/CMakeLists.txt @@ -1,3 +1,13 @@ +include(CheckLanguage) +check_language(Fortran) +if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + set(EIGEN_Fortran_COMPILER_WORKS ON) +else() + set(EIGEN_Fortran_COMPILER_WORKS OFF) +endif() + +if (EIGEN_Fortran_COMPILER_WORKS) macro(ei_add_blas_test testname) @@ -38,3 +48,5 @@ ei_add_blas_test(zblat3) # add_custom_target(level1) # add_dependencies(level1 sblat1) +endif() + From 4be7e6b4e0a82853e853c0c7c4ef72f395e1f497 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 4 Aug 2025 20:56:24 +0000 Subject: [PATCH 14/70] Fix pcmp_* for HVX to comply with the new definition of true = Scalar(1) --- Eigen/src/Core/arch/HVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index ccba96efd..b9080d988 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -401,7 +401,7 @@ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { template EIGEN_STRONG_INLINE HVXPacket pcmp_le_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); return 
HVXPacket::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); } @@ -420,7 +420,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_eq_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -439,7 +439,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -458,7 +458,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_or_nan_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } From 975a5aba4fafbc07624e6e930407a9caaf2b4089 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 6 Aug 2025 19:00:08 +0000 Subject: [PATCH 15/70] Fix TODO: Use std::bit_cast or __builtin_bit_cast if available. --- Eigen/src/Core/NumTraits.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 5e4e5c2ff..b93faae26 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -95,9 +95,19 @@ struct default_max_digits10_impl // Integer } // end namespace internal namespace numext { -/** \internal bit-wise cast without changing the underlying bit representation. 
*/ -// TODO: Replace by std::bit_cast (available in C++20) +/** \internal bit-wise cast without changing the underlying bit representation. */ +#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) { return std::bit_cast(src); } +#elif EIGEN_HAS_BUILTIN(__builtin_bit_cast) +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) { + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value, THIS_TYPE_IS_NOT_SUPPORTED) + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value, THIS_TYPE_IS_NOT_SUPPORTED) + EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED) + return __builtin_bit_cast(Tgt, src); +} +#else template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { // The behaviour of memcpy is not specified for non-trivially copyable types @@ -113,6 +123,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { memcpy(static_cast(&tgt), static_cast(&staged), sizeof(Tgt)); return tgt; } +#endif } // namespace numext // clang-format off From 8b9dbcdaaf668d39ace2757f5d91860036a8e738 Mon Sep 17 00:00:00 2001 From: Tyler Veness Date: Thu, 7 Aug 2025 00:03:33 +0000 Subject: [PATCH 16/70] Fix numext::bit_cast() compilation failure in C++20 --- Eigen/src/Core/NumTraits.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index b93faae26..bf41c3bb6 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -98,7 +98,10 @@ namespace numext { /** \internal bit-wise cast without changing the underlying bit representation. 
*/ #if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) { return std::bit_cast(src); } +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) { + return std::bit_cast(src); +} #elif EIGEN_HAS_BUILTIN(__builtin_bit_cast) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) { From ddce1d7d12076d13bac1c517609ca6b638d071f4 Mon Sep 17 00:00:00 2001 From: Artem Bishev Date: Thu, 7 Aug 2025 16:58:22 +0000 Subject: [PATCH 17/70] Fixes #2952 --- Eigen/src/Core/VectorwiseOp.h | 36 ++++++++++++++++++++++++++++++----- test/vectorwiseop.cpp | 26 +++++++++++++++++-------- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 9ccbf7d76..9e34d8c99 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -146,6 +146,22 @@ struct member_redux { const BinaryOp& binaryFunc() const { return m_functor; } const BinaryOp m_functor; }; + +template +struct scalar_replace_zero_with_one_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& x) const { + return numext::is_exactly_zero(x) ? Scalar(1) : x; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return pselect(pcmp_eq(x, pzero(x)), pset1(Scalar(1)), x); + } +}; +template +struct functor_traits> { + enum { Cost = 1, PacketAccess = packet_traits::HasCmp }; +}; + } // namespace internal /** \class VectorwiseOp @@ -624,18 +640,28 @@ class VectorwiseOp { return m_matrix / extendedTo(other.derived()); } + using Normalized_NonzeroNormType = + CwiseUnaryOp, const NormReturnType>; + using NormalizedReturnType = CwiseBinaryOp, const ExpressionTypeNestedCleaned, + const typename OppositeExtendedType::Type>; + /** \returns an expression where each column (or row) of the referenced matrix are normalized. 
* The referenced matrix is \b not modified. + * + * \warning If the input columns (or rows) are too small (i.e., their norm equals to 0), they remain unchanged in the + * resulting expression. + * * \sa MatrixBase::normalized(), normalize() */ - EIGEN_DEVICE_FUNC CwiseBinaryOp, const ExpressionTypeNestedCleaned, - const typename OppositeExtendedType::Type> - normalized() const { - return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); + EIGEN_DEVICE_FUNC NormalizedReturnType normalized() const { + return m_matrix.cwiseQuotient(extendedToOpposite(Normalized_NonzeroNormType(this->norm()))); } /** Normalize in-place each row or columns of the referenced matrix. - * \sa MatrixBase::normalize(), normalized() + * + * \warning If the input columns (or rows) are too small (i.e., their norm equals to 0), they are left unchanged. + * + * \sa MatrixBase::normalized(), normalize() */ EIGEN_DEVICE_FUNC void normalize() { m_matrix = this->normalized(); } diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 6d0e5cbf8..d037bb49b 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -114,6 +114,8 @@ void vectorwiseop_matrix(const MatrixType& m) { RealColVectorType rcres; RealRowVectorType rrres; + Scalar small_scalar = (std::numeric_limits::min)(); + // test broadcast assignment m2 = m1; m2.colwise() = colvec; @@ -171,18 +173,26 @@ void vectorwiseop_matrix(const MatrixType& m) { VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum()); // test normalized - m2 = m1.colwise().normalized(); - VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized()); - m2 = m1.rowwise().normalized(); - VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized()); + m2 = m1; + m2.col(c).fill(small_scalar); + m3 = m2.colwise().normalized(); + for (Index k = 0; k < cols; ++k) VERIFY_IS_APPROX(m3.col(k), m2.col(k).normalized()); + m2 = m1; + m2.row(r).setZero(); + m3 = m2.rowwise().normalized(); + for (Index k = 0; k < rows; ++k) VERIFY_IS_APPROX(m3.row(k), 
m2.row(k).normalized()); // test normalize m2 = m1; - m2.colwise().normalize(); - VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized()); + m2.col(c).setZero(); + m3 = m2; + m3.colwise().normalize(); + for (Index k = 0; k < cols; ++k) VERIFY_IS_APPROX(m3.col(k), m2.col(k).normalized()); m2 = m1; - m2.rowwise().normalize(); - VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized()); + m2.row(r).fill(small_scalar); + m3 = m2; + m3.rowwise().normalize(); + for (Index k = 0; k < rows; ++k) VERIFY_IS_APPROX(m3.row(k), m2.row(k).normalized()); // test with partial reduction of products Matrix m1m1 = m1 * m1.transpose(); From 1c0048a08c07f4f42995c2f0dc142a0752ecfb0d Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Sat, 9 Aug 2025 19:32:30 +0000 Subject: [PATCH 18/70] Fix inconsistency between ptrue and pcmp_* in HVX --- Eigen/src/Core/arch/HVX/PacketMath.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index b9080d988..9b6ceb320 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -399,9 +399,26 @@ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { return pnegate_hvx(a); } +template +EIGEN_STRONG_INLINE HVXPacket ptrue_hvx(const HVXPacket& a) { + return HVXPacket::Create(Q6_V_vsplat_R(0x3f800000)); +} +template <> +EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) { + return ptrue_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) { + return ptrue_hvx(a); +} +template <> +EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) { + return ptrue_hvx(a); +} + template EIGEN_STRONG_INLINE HVXPacket pcmp_le_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = ptrue(a).Get(); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); } @@ -420,7 +437,7 
@@ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_eq_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = ptrue(a).Get(); HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -439,7 +456,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = ptrue(a).Get(); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -458,7 +475,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_or_nan_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = ptrue(a).Get(); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } From e15cd620a06eab392fc6fa98cb54e427cedfdbd2 Mon Sep 17 00:00:00 2001 From: Artem Bishev Date: Sun, 10 Aug 2025 17:44:09 +0000 Subject: [PATCH 19/70] Remove select class --- Eigen/src/Core/CoreEvaluators.h | 44 ----------- Eigen/src/Core/Select.h | 92 ++++------------------- Eigen/src/Core/util/ForwardDeclarations.h | 2 - test/array_cwise.cpp | 2 +- test/array_for_matrix.cpp | 2 +- test/evaluators.cpp | 2 +- 6 files changed, 17 insertions(+), 127 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index ec731acd5..c9b2d2d28 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1565,50 +1565,6 @@ struct block_evaluator -struct evaluator> - : evaluator_base> { - typedef Select XprType; - enum { - 
CoeffReadCost = evaluator::CoeffReadCost + - plain_enum_max(evaluator::CoeffReadCost, evaluator::CoeffReadCost), - - Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits, - - Alignment = plain_enum_min(evaluator::Alignment, evaluator::Alignment) - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& select) - : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) { - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - if (m_conditionImpl.coeff(row, col)) - return m_thenImpl.coeff(row, col); - else - return m_elseImpl.coeff(row, col); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (m_conditionImpl.coeff(index)) - return m_thenImpl.coeff(index); - else - return m_elseImpl.coeff(index); - } - - protected: - evaluator m_conditionImpl; - evaluator m_thenImpl; - evaluator m_elseImpl; -}; - // -------------------- Replicate -------------------- template diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h index 0fa5f1e17..61a67c2f7 100644 --- a/Eigen/src/Core/Select.h +++ b/Eigen/src/Core/Select.h @@ -15,7 +15,7 @@ namespace Eigen { -/** \class Select +/** \typedef Select * \ingroup Core_Module * * \brief Expression of a coefficient wise version of the C++ ternary operator ?: @@ -24,73 +24,16 @@ namespace Eigen { * \tparam ThenMatrixType the type of the \em then expression * \tparam ElseMatrixType the type of the \em else expression * - * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:. + * This type represents an expression of a coefficient wise version of the C++ ternary operator ?:. * It is the return type of DenseBase::select() and most of the time this is the only way it is used. 
* * \sa DenseBase::select(const DenseBase&, const DenseBase&) const */ - -namespace internal { template -struct traits > : traits { - typedef typename traits::Scalar Scalar; - typedef Dense StorageKind; - typedef typename traits::XprKind XprKind; - typedef typename ConditionMatrixType::Nested ConditionMatrixNested; - typedef typename ThenMatrixType::Nested ThenMatrixNested; - typedef typename ElseMatrixType::Nested ElseMatrixNested; - enum { - RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime, - ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime, - MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime, - MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime, - Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit - }; -}; -} // namespace internal - -template -class Select : public internal::dense_xpr_base >::type, - internal::no_assignment_operator { - public: - typedef typename internal::dense_xpr_base