From b86641a4c20fc9709bba0d9056821689a4c6d081 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 22 Mar 2024 22:32:29 +0000 Subject: [PATCH] Add support for casting between double and int64_t for SSE and AVX2. --- Eigen/src/Core/arch/AVX/PacketMath.h | 7 ++- Eigen/src/Core/arch/AVX/TypeCasting.h | 51 ++++++++++++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 1 + Eigen/src/Core/arch/SSE/TypeCasting.h | 29 +++++++++++ Eigen/src/Core/util/ConfigureVectorization.h | 3 ++ 5 files changed, 87 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index b4ded4145..8ff226de8 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -270,9 +270,7 @@ struct packet_traits : default_packet_traits { template <> struct packet_traits : default_packet_traits { typedef Packet4l type; - // There is no half-size packet for current Packet4l. - // TODO: support as SSE path. - typedef Packet4l half; + typedef Packet2l half; enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 }; }; template <> @@ -332,6 +330,7 @@ template <> struct unpacket_traits { typedef double type; typedef Packet2d half; + typedef Packet4l integer_packet; enum { size = 4, alignment = Aligned32, @@ -368,7 +367,7 @@ struct unpacket_traits { template <> struct unpacket_traits { typedef int64_t type; - typedef Packet4l half; + typedef Packet2l half; enum { size = 4, alignment = Aligned32, diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 3688f8df2..f01622c5a 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -47,6 +47,13 @@ template <> struct type_casting_traits : vectorized_type_casting_traits {}; template <> struct type_casting_traits : vectorized_type_casting_traits {}; + +#ifdef EIGEN_VECTORIZE_AVX2 +template <> +struct type_casting_traits : vectorized_type_casting_traits {}; +template <> +struct type_casting_traits : vectorized_type_casting_traits {}; +#endif #endif template <> @@ -188,6 +195,35 @@ EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet8ui } #ifdef EIGEN_VECTORIZE_AVX2 +template <> +EIGEN_STRONG_INLINE Packet4l pcast(const Packet4d& a) { +#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL) + return _mm256_cvttpd_epi64(a); +#else + EIGEN_ALIGN16 double aux[4]; + pstore(aux, a); + return _mm256_set_epi64x(static_cast(aux[3]), static_cast(aux[2]), static_cast(aux[1]), + static_cast(aux[0])); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4d pcast(const Packet4l& a) { +#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL) + return _mm256_cvtepi64_pd(a); +#else + EIGEN_ALIGN16 int64_t aux[4]; + pstore(aux, a); + return _mm256_set_pd(static_cast(aux[3]), static_cast(aux[2]), static_cast(aux[1]), + static_cast(aux[0])); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4d pcast(const Packet2l& a, const Packet2l& b) { + return _mm256_set_m128d(pcast(b), pcast(a)); +} + template <> EIGEN_STRONG_INLINE Packet4ul preinterpret(const Packet4l& a) { return Packet4ul(a); @@ -198,6 +234,21 @@ EIGEN_STRONG_INLINE Packet4l preinterpret(const Packet4ul& return Packet4l(a); } +template <> +EIGEN_STRONG_INLINE Packet4l preinterpret(const Packet4d& a) { + return _mm256_castpd_si256(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4d preinterpret(const Packet4l& a) { + return _mm256_castsi256_pd(a); +} + +// truncation operations +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet4l& a) { + return _mm256_castsi256_si128(a); +} #endif template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index b6d2d984b..ed2f189aa 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -34,6 +34,7 @@ namespace internal { typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; +// TODO(rmlarsen): Add support for Packet8l. #ifndef EIGEN_VECTORIZE_AVX512FP16 typedef eigen_packet_wrapper<__m256i, 1> Packet16h; #endif diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index cbc6d4714..42871c91b 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -37,6 +37,13 @@ template <> struct type_casting_traits : vectorized_type_casting_traits {}; template <> struct type_casting_traits : vectorized_type_casting_traits {}; + +#ifndef EIGEN_VECTORIZE_AVX2 +template <> +struct type_casting_traits : vectorized_type_casting_traits {}; +template <> +struct type_casting_traits : vectorized_type_casting_traits {}; +#endif #endif template <> @@ -79,6 +86,18 @@ EIGEN_STRONG_INLINE Packet4i pcast(const Packet2d& a, const (1 << 2) | (1 << 6))); } +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { + return _mm_set_epi64x(_mm_cvtsd_si64(preverse(a)), _mm_cvtsd_si64(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { + EIGEN_ALIGN16 int64_t aux[2]; + pstore(aux, a); + return _mm_set_pd(static_cast(aux[1]), static_cast(aux[0])); +} + template <> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { return _mm_cvtepi32_ps(a); @@ -126,6 +145,15 @@ EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) return _mm_castsi128_pd(a); } +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return _mm_castsi128_pd(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return _mm_castpd_si128(a); +} + template <> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { return _mm_castpd_si128(a); @@ -140,6 +168,7 @@ template <> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { return Packet4i(a); } + // Disable the following code since it's broken on too many platforms / compilers. // #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) #if 0 diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index e692438a8..1c7217339 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -266,6 +266,9 @@ #ifdef __AVX512BF16__ #define EIGEN_VECTORIZE_AVX512BF16 #endif +#ifdef __AVX512VL__ +#define EIGEN_VECTORIZE_AVX512VL +#endif #ifdef __AVX512FP16__ #ifdef __AVX512VL__ #define EIGEN_VECTORIZE_AVX512FP16