From b86641a4c20fc9709bba0d9056821689a4c6d081 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 22 Mar 2024 22:32:29 +0000
Subject: [PATCH] Add support for casting between double and int64_t for SSE
 and AVX2.

---
 Eigen/src/Core/arch/AVX/PacketMath.h         |  7 ++-
 Eigen/src/Core/arch/AVX/TypeCasting.h        | 51 ++++++++++++++++++++
 Eigen/src/Core/arch/AVX512/PacketMath.h      |  1 +
 Eigen/src/Core/arch/SSE/TypeCasting.h        | 29 +++++++++++
 Eigen/src/Core/util/ConfigureVectorization.h |  3 ++
 5 files changed, 87 insertions(+), 4 deletions(-)
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index b4ded4145..8ff226de8 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -270,9 +270,7 @@ struct packet_traits<uint32_t> : default_packet_traits {
 template <>
 struct packet_traits<int64_t> : default_packet_traits {
   typedef Packet4l type;
-  // There is no half-size packet for current Packet4l.
-  // TODO: support as SSE path.
-  typedef Packet4l half;
+  typedef Packet2l half;
   enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
 };
 template <>
@@ -332,6 +330,7 @@ template <>
 struct unpacket_traits<Packet4d> {
   typedef double type;
   typedef Packet2d half;
+  typedef Packet4l integer_packet;
   enum {
     size = 4,
     alignment = Aligned32,
@@ -368,7 +367,7 @@ struct unpacket_traits<Packet8ui> {
 template <>
 struct unpacket_traits<Packet4l> {
   typedef int64_t type;
-  typedef Packet4l half;
+  typedef Packet2l half;
   enum {
     size = 4,
     alignment = Aligned32,
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 3688f8df2..f01622c5a 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -47,6 +47,13 @@ template <>
 struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
 template <>
 struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+#endif
 #endif
 
 template <>
@@ -188,6 +195,35 @@ EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui
 }
 
 #ifdef EIGEN_VECTORIZE_AVX2
+template <>
+EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
+  return _mm256_cvttpd_epi64(a);
+#else
+  EIGEN_ALIGN16 double aux[4];
+  pstore(aux, a);
+  return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
+                           static_cast<int64_t>(aux[0]));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
+  return _mm256_cvtepi64_pd(a);
+#else
+  EIGEN_ALIGN16 int64_t aux[4];
+  pstore(aux, a);
+  return _mm256_set_pd(static_cast<double>(aux[3]), static_cast<double>(aux[2]), static_cast<double>(aux[1]),
+                       static_cast<double>(aux[0]));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet2l, Packet4d>(const Packet2l& a, const Packet2l& b) {
+  return _mm256_set_m128d(pcast<Packet2l, Packet2d>(b), pcast<Packet2l, Packet2d>(a));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
   return Packet4ul(a);
@@ -198,6 +234,21 @@ EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul&
   return Packet4l(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4d>(const Packet4d& a) {
+  return _mm256_castpd_si256(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet4l>(const Packet4l& a) {
+  return _mm256_castsi256_pd(a);
+}
+
+// truncation operations
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet4l>(const Packet4l& a) {
+  return _mm256_castsi256_si128(a);
+}
 #endif
 
 template <>
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index b6d2d984b..ed2f189aa 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -34,6 +34,7 @@ namespace internal {
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
+// TODO(rmlarsen): Add support for Packet8l.
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
 #endif
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index cbc6d4714..42871c91b 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -37,6 +37,13 @@ template <>
 struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
 template <>
 struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+
+#ifndef EIGEN_VECTORIZE_AVX2
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+#endif
 #endif
 
 template <>
@@ -79,6 +86,18 @@ EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const
                                          (1 << 2) | (1 << 6)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+  return _mm_set_epi64x(_mm_cvtsd_si64(preverse(a)), _mm_cvtsd_si64(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  EIGEN_ALIGN16 int64_t aux[2];
+  pstore(aux, a);
+  return _mm_set_pd(static_cast<double>(aux[1]), static_cast<double>(aux[0]));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
   return _mm_cvtepi32_ps(a);
@@ -126,6 +145,15 @@ EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a)
   return _mm_castsi128_pd(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return _mm_castsi128_pd(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
   return _mm_castpd_si128(a);
@@ -140,6 +168,7 @@ template <>
 EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
   return Packet4i(a);
 }
+
 // Disable the following code since it's broken on too many platforms / compilers.
 // #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index e692438a8..1c7217339 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -266,6 +266,9 @@
 #ifdef __AVX512BF16__
 #define EIGEN_VECTORIZE_AVX512BF16
 #endif
+#ifdef __AVX512VL__
+#define EIGEN_VECTORIZE_AVX512VL
+#endif
 #ifdef __AVX512FP16__
 #ifdef __AVX512VL__
 #define EIGEN_VECTORIZE_AVX512FP16