Add support for casting between double and int64_t for SSE and AVX2.

This commit is contained in:
Rasmus Munk Larsen 2024-03-22 22:32:29 +00:00
parent d883932586
commit b86641a4c2
5 changed files with 87 additions and 4 deletions

View File

@ -270,9 +270,7 @@ struct packet_traits<uint32_t> : default_packet_traits {
template <> template <>
struct packet_traits<int64_t> : default_packet_traits { struct packet_traits<int64_t> : default_packet_traits {
typedef Packet4l type; typedef Packet4l type;
// There is no half-size packet for current Packet4l. typedef Packet2l half;
// TODO: support as SSE path.
typedef Packet4l half;
enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 }; enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
}; };
template <> template <>
@ -332,6 +330,7 @@ template <>
struct unpacket_traits<Packet4d> { struct unpacket_traits<Packet4d> {
typedef double type; typedef double type;
typedef Packet2d half; typedef Packet2d half;
typedef Packet4l integer_packet;
enum { enum {
size = 4, size = 4,
alignment = Aligned32, alignment = Aligned32,
@ -368,7 +367,7 @@ struct unpacket_traits<Packet8ui> {
template <> template <>
struct unpacket_traits<Packet4l> { struct unpacket_traits<Packet4l> {
typedef int64_t type; typedef int64_t type;
typedef Packet4l half; typedef Packet2l half;
enum { enum {
size = 4, size = 4,
alignment = Aligned32, alignment = Aligned32,

View File

@ -47,6 +47,13 @@ template <>
struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {}; struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
template <> template <>
struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {}; struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
template <>
struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
#endif
#endif #endif
template <> template <>
@ -188,6 +195,35 @@ EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui
} }
#ifdef EIGEN_VECTORIZE_AVX2 #ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
  // Truncating double -> int64_t conversion (same semantics as static_cast).
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
  // AVX512DQ+VL provides a direct truncating conversion on 256-bit vectors.
  // (Was misspelled EIGEN_VECTORIZE_AVS512VL, which made this path dead code.)
  return _mm256_cvttpd_epi64(a);
#else
  // Scalar fallback: spill the packet to memory and convert lane by lane.
  // pstore of a 256-bit packet is an aligned store, so the buffer needs
  // 32-byte alignment; EIGEN_ALIGN16 was insufficient.
  EIGEN_ALIGN32 double aux[4];
  pstore(aux, a);
  return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
                           static_cast<int64_t>(aux[0]));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
  // int64_t -> double conversion.
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
  // AVX512DQ+VL provides a direct conversion on 256-bit vectors.
  // (Was misspelled EIGEN_VECTORIZE_AVS512VL, which made this path dead code.)
  return _mm256_cvtepi64_pd(a);
#else
  // Scalar fallback: spill the packet to memory and convert lane by lane.
  // pstore of a 256-bit packet is an aligned store, so the buffer needs
  // 32-byte alignment; EIGEN_ALIGN16 was insufficient.
  EIGEN_ALIGN32 int64_t aux[4];
  pstore(aux, a);
  return _mm256_set_pd(static_cast<double>(aux[3]), static_cast<double>(aux[2]), static_cast<double>(aux[1]),
                       static_cast<double>(aux[0]));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet2l, Packet4d>(const Packet2l& a, const Packet2l& b) {
  // Convert each 128-bit input separately, then glue the two halves into one
  // 256-bit packet (a becomes the low half, b the high half).
  const Packet2d lo = pcast<Packet2l, Packet2d>(a);
  const Packet2d hi = pcast<Packet2l, Packet2d>(b);
  return _mm256_set_m128d(hi, lo);
}
template <> template <>
EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) { EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
return Packet4ul(a); return Packet4ul(a);
@ -198,6 +234,21 @@ EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul&
return Packet4l(a); return Packet4l(a);
} }
template <>
// Bit-level reinterpretation (no value conversion): view the 256-bit double
// packet as four int64 lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4d>(const Packet4d& a) {
  return _mm256_castpd_si256(a);
}
template <>
// Bit-level reinterpretation (no value conversion): view the 256-bit integer
// packet as four double lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet4l>(const Packet4l& a) {
  return _mm256_castsi256_pd(a);
}
// truncation operations
template <>
// Half-packet extraction: returns the low 128 bits of the 256-bit packet
// (lanes 0-1). No data conversion; the upper lanes are discarded.
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet4l>(const Packet4l& a) {
  return _mm256_castsi256_si128(a);
}
#endif #endif
template <> template <>

View File

@ -34,6 +34,7 @@ namespace internal {
typedef __m512 Packet16f; typedef __m512 Packet16f;
typedef __m512i Packet16i; typedef __m512i Packet16i;
typedef __m512d Packet8d; typedef __m512d Packet8d;
// TODO(rmlarsen): Add support for Packet8l.
#ifndef EIGEN_VECTORIZE_AVX512FP16 #ifndef EIGEN_VECTORIZE_AVX512FP16
typedef eigen_packet_wrapper<__m256i, 1> Packet16h; typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
#endif #endif

View File

@ -37,6 +37,13 @@ template <>
struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {}; struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
template <> template <>
struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {}; struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
#ifndef EIGEN_VECTORIZE_AVX2
template <>
struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
template <>
struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
#endif
#endif #endif
template <> template <>
@ -79,6 +86,18 @@ EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const
(1 << 2) | (1 << 6))); (1 << 2) | (1 << 6)));
} }
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
  // Truncating double -> int64_t conversion (same semantics as static_cast).
  // _mm_cvttsd_si64 truncates toward zero; the previously used _mm_cvtsd_si64
  // rounds under the current rounding mode (round-to-nearest by default),
  // which disagrees with static_cast and with the AVX path's
  // _mm256_cvttpd_epi64.
  return _mm_set_epi64x(_mm_cvttsd_si64(preverse(a)), _mm_cvttsd_si64(a));
}
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
  // int64_t -> double: SSE has no direct conversion, so spill the packet to
  // an aligned buffer and convert each lane with a scalar cast.
  EIGEN_ALIGN16 int64_t buf[2];
  pstore(buf, a);
  // _mm_setr_pd takes lanes in memory order (low lane first).
  return _mm_setr_pd(static_cast<double>(buf[0]), static_cast<double>(buf[1]));
}
template <> template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a); return _mm_cvtepi32_ps(a);
@ -126,6 +145,15 @@ EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a)
return _mm_castsi128_pd(a); return _mm_castsi128_pd(a);
} }
template <>
// Bit-level reinterpretation (no value conversion): view the 128-bit integer
// packet as two double lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
  return _mm_castsi128_pd(a);
}
template <>
// Bit-level reinterpretation (no value conversion): view the 128-bit double
// packet as two int64 lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
  return _mm_castpd_si128(a);
}
template <> template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) { EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
return _mm_castpd_si128(a); return _mm_castpd_si128(a);
@ -140,6 +168,7 @@ template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) { EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
return Packet4i(a); return Packet4i(a);
} }
// Disable the following code since it's broken on too many platforms / compilers. // Disable the following code since it's broken on too many platforms / compilers.
// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) // #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0 #if 0

View File

@ -266,6 +266,9 @@
#ifdef __AVX512BF16__ #ifdef __AVX512BF16__
#define EIGEN_VECTORIZE_AVX512BF16 #define EIGEN_VECTORIZE_AVX512BF16
#endif #endif
#ifdef __AVX512VL__
#define EIGEN_VECTORIZE_AVX512VL
#endif
#ifdef __AVX512FP16__ #ifdef __AVX512FP16__
#ifdef __AVX512VL__ #ifdef __AVX512VL__
#define EIGEN_VECTORIZE_AVX512FP16 #define EIGEN_VECTORIZE_AVX512FP16