Add support for casting between double and int64_t for SSE and AVX2.

This commit is contained in:
Rasmus Munk Larsen 2024-03-22 22:32:29 +00:00
parent d883932586
commit b86641a4c2
5 changed files with 87 additions and 4 deletions

View File

@ -270,9 +270,7 @@ struct packet_traits<uint32_t> : default_packet_traits {
template <> template <>
struct packet_traits<int64_t> : default_packet_traits { struct packet_traits<int64_t> : default_packet_traits {
typedef Packet4l type; typedef Packet4l type;
// There is no half-size packet for current Packet4l. typedef Packet2l half;
// TODO: support as SSE path.
typedef Packet4l half;
enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 }; enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
}; };
template <> template <>
@ -332,6 +330,7 @@ template <>
struct unpacket_traits<Packet4d> { struct unpacket_traits<Packet4d> {
typedef double type; typedef double type;
typedef Packet2d half; typedef Packet2d half;
typedef Packet4l integer_packet;
enum { enum {
size = 4, size = 4,
alignment = Aligned32, alignment = Aligned32,
@ -368,7 +367,7 @@ struct unpacket_traits<Packet8ui> {
template <> template <>
struct unpacket_traits<Packet4l> { struct unpacket_traits<Packet4l> {
typedef int64_t type; typedef int64_t type;
typedef Packet4l half; typedef Packet2l half;
enum { enum {
size = 4, size = 4,
alignment = Aligned32, alignment = Aligned32,

View File

@ -47,6 +47,13 @@ template <>
struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {}; struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
template <> template <>
struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {}; struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
template <>
struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
#endif
#endif #endif
template <> template <>
@ -188,6 +195,35 @@ EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui
} }
#ifdef EIGEN_VECTORIZE_AVX2 #ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
  // Truncating double -> int64_t conversion (same semantics as static_cast).
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
  // AVX512DQ+VL provides a direct truncating conversion on 256-bit vectors.
  // (Was misspelled EIGEN_VECTORIZE_AVS512VL, which made this path dead code.)
  return _mm256_cvttpd_epi64(a);
#else
  // Scalar fallback: spill the packet to memory and convert lane by lane.
  // pstore of a 256-bit packet is an aligned store, so the buffer needs
  // 32-byte alignment; EIGEN_ALIGN16 was insufficient.
  EIGEN_ALIGN32 double aux[4];
  pstore(aux, a);
  return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
                           static_cast<int64_t>(aux[0]));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
  // int64_t -> double conversion.
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
  // AVX512DQ+VL provides a direct conversion on 256-bit vectors.
  // (Was misspelled EIGEN_VECTORIZE_AVS512VL, which made this path dead code.)
  return _mm256_cvtepi64_pd(a);
#else
  // Scalar fallback: spill the packet to memory and convert lane by lane.
  // pstore of a 256-bit packet is an aligned store, so the buffer needs
  // 32-byte alignment; EIGEN_ALIGN16 was insufficient.
  EIGEN_ALIGN32 int64_t aux[4];
  pstore(aux, a);
  return _mm256_set_pd(static_cast<double>(aux[3]), static_cast<double>(aux[2]), static_cast<double>(aux[1]),
                       static_cast<double>(aux[0]));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet2l, Packet4d>(const Packet2l& a, const Packet2l& b) {
  // Convert each 128-bit input separately, then glue the two halves into one
  // 256-bit packet (a becomes the low half, b the high half).
  const Packet2d lo = pcast<Packet2l, Packet2d>(a);
  const Packet2d hi = pcast<Packet2l, Packet2d>(b);
  return _mm256_set_m128d(hi, lo);
}
template <> template <>
EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) { EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
return Packet4ul(a); return Packet4ul(a);
@ -198,6 +234,21 @@ EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul&
return Packet4l(a); return Packet4l(a);
} }
template <>
// Bit-level reinterpretation (no value conversion): view the 256-bit double
// packet as four int64 lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4d>(const Packet4d& a) {
  return _mm256_castpd_si256(a);
}
template <>
// Bit-level reinterpretation (no value conversion): view the 256-bit integer
// packet as four double lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet4l>(const Packet4l& a) {
  return _mm256_castsi256_pd(a);
}
// truncation operations
template <>
// Half-packet extraction: returns the low 128 bits of the 256-bit packet
// (lanes 0-1). No data conversion; the upper lanes are discarded.
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet4l>(const Packet4l& a) {
  return _mm256_castsi256_si128(a);
}
#endif #endif
template <> template <>

View File

@ -34,6 +34,7 @@ namespace internal {
typedef __m512 Packet16f; typedef __m512 Packet16f;
typedef __m512i Packet16i; typedef __m512i Packet16i;
typedef __m512d Packet8d; typedef __m512d Packet8d;
// TODO(rmlarsen): Add support for Packet8l.
#ifndef EIGEN_VECTORIZE_AVX512FP16 #ifndef EIGEN_VECTORIZE_AVX512FP16
typedef eigen_packet_wrapper<__m256i, 1> Packet16h; typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
#endif #endif

View File

@ -37,6 +37,13 @@ template <>
struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {}; struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
template <> template <>
struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {}; struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
#ifndef EIGEN_VECTORIZE_AVX2
template <>
struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
template <>
struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
#endif
#endif #endif
template <> template <>
@ -79,6 +86,18 @@ EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const
(1 << 2) | (1 << 6))); (1 << 2) | (1 << 6)));
} }
template <>
EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
  // Truncating double -> int64_t conversion (same semantics as static_cast).
  // _mm_cvttsd_si64 truncates toward zero; the previously used _mm_cvtsd_si64
  // rounds under the current rounding mode (round-to-nearest by default),
  // which disagrees with static_cast and with the AVX path's
  // _mm256_cvttpd_epi64.
  return _mm_set_epi64x(_mm_cvttsd_si64(preverse(a)), _mm_cvttsd_si64(a));
}
template <>
EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
  // int64_t -> double: SSE has no direct conversion, so spill the packet to
  // an aligned buffer and convert each lane with a scalar cast.
  EIGEN_ALIGN16 int64_t buf[2];
  pstore(buf, a);
  // _mm_setr_pd takes lanes in memory order (low lane first).
  return _mm_setr_pd(static_cast<double>(buf[0]), static_cast<double>(buf[1]));
}
template <> template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a); return _mm_cvtepi32_ps(a);
@ -126,6 +145,15 @@ EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a)
return _mm_castsi128_pd(a); return _mm_castsi128_pd(a);
} }
template <>
// Bit-level reinterpretation (no value conversion): view the 128-bit integer
// packet as two double lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
  return _mm_castsi128_pd(a);
}
template <>
// Bit-level reinterpretation (no value conversion): view the 128-bit double
// packet as two int64 lanes. Compiles to no instruction.
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
  return _mm_castpd_si128(a);
}
template <> template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) { EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
return _mm_castpd_si128(a); return _mm_castpd_si128(a);
@ -140,6 +168,7 @@ template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) { EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
return Packet4i(a); return Packet4i(a);
} }
// Disable the following code since it's broken on too many platforms / compilers. // Disable the following code since it's broken on too many platforms / compilers.
// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) // #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0 #if 0

View File

@ -266,6 +266,9 @@
#ifdef __AVX512BF16__ #ifdef __AVX512BF16__
#define EIGEN_VECTORIZE_AVX512BF16 #define EIGEN_VECTORIZE_AVX512BF16
#endif #endif
#ifdef __AVX512VL__
#define EIGEN_VECTORIZE_AVX512VL
#endif
#ifdef __AVX512FP16__ #ifdef __AVX512FP16__
#ifdef __AVX512VL__ #ifdef __AVX512VL__
#define EIGEN_VECTORIZE_AVX512FP16 #define EIGEN_VECTORIZE_AVX512FP16