Fix MSVC arm build.

(cherry picked from commit 0a5392d6061134a4a32d0025fa154f830b83d606)
2025-10-17 18:41:31 +08:00 · 2023-02-08 21:46:37 +00:00 · 2023-02-08 21:46:37 +00:00 · 879854382c
commit 879854382c
parent 90dce8dfa3
3 changed files with 244 additions and 209 deletions
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@ -129,12 +129,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa

 template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
 {
-  const Packet2ui b = vreinterpret_u32_f32(a.v);
+  const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v));
  return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
-  const Packet4ui b = vreinterpretq_u32_f32(a.v);
+  const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v));
  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }

--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -57,6 +57,16 @@ typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
 typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
 typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;

+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
+  float from[4] = {a, b, c, d};
+  return vld1q_f32(from);
+}
+
+EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
+  float from[2] = {a, b};
+  return vld1_f32(from);
+}
+
 #else

 typedef float32x2_t                          Packet2f;
@ -78,11 +88,14 @@ typedef uint32x4_t                           Packet4ui;
 typedef int64x2_t                            Packet2l;
 typedef uint64x2_t                           Packet2ul;

+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return {a, b, c, d}; }
+EIGEN_ALWAYS_INLINE Packet4f make_packet2f(float a, float b) { return {a, b}; }
+
 #endif // EIGEN_COMP_MSVC_STRICT

 EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
  const float* a = reinterpret_cast<const float*>(&m);
-  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3)));
  return res;
 }

@ -95,7 +108,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int
 {
  const float* a = reinterpret_cast<const float*>(&m);
  const float* b = reinterpret_cast<const float*>(&n);
-  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
  return res;
 }

@ -104,7 +117,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n
 {
  const float* a = reinterpret_cast<const float*>(&m);
  const float* b = reinterpret_cast<const float*>(&n);
-  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
  return res;
 }

@ -146,7 +159,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
  // prefetch instructions there are too detailed for __builtin_prefetch to map
  // meaningfully to them.
@ -862,12 +875,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, con

 template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
 template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
-  Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
+  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
  return padd(a, pxor(mask, b));
 }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
 template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
+  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
  return padd(a, pxor(mask, b));
 }

@ -2499,7 +2512,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(co
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
 { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
 {
  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
@ -2513,7 +2526,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
 }
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
 {
  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
@ -2527,7 +2540,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
 }
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
 template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
 {
  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
@ -2563,11 +2576,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
 { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
 { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
 template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
 { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
 template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
@ -3388,7 +3401,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
 {
  // See the scalar implemention in BFloat16.h for a comprehensible explanation
  // of this fast rounding algorithm
-  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));

  // lsb = (input >> 16) & 1
  Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
@ -3413,7 +3426,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)

 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
 {
-  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
 }

 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
@ -3421,21 +3434,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
 }

 template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
-  return pset1<Packet4us>(from.value);
+  return Packet4bf(pset1<Packet4us>(from.value));
 }

 template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
-  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
 }

 template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
 {
-  return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+  return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }

 template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
 {
-  return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+  return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }

 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
@ -3450,7 +3463,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet

 template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
 {
-  return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+  return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }

 template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
@ -3497,25 +3510,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
 }

 template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
-  return por<Packet4us>(a, b);
+  return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
-  return pxor<Packet4us>(a, b);
+  return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
-  return pand<Packet4us>(a, b);
+  return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
-  return pandnot<Packet4us>(a, b);
+  return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
 }

 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
                                                      const Packet4bf& b)
 {
-  return pselect<Packet4us>(mask, a, b);
+  return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
@ -3554,13 +3567,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, con
 template<>
 EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
 {
-  return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
+  return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
 }

 template<>
 EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
 {
-  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
+  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
 }

 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
@ -3585,7 +3598,7 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a

 template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
 {
-  return preverse<Packet4us>(a);
+  return Packet4bf(preverse<Packet4us>(Packet4us(a)));
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
@ -3620,7 +3633,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a,

 template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
 {
-  return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
+  return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
 }

 //---------- double ----------
@ -3638,17 +3651,34 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)

 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG

+#if EIGEN_COMP_GNUC
 // Bug 907: workaround missing declarations of the following two functions in the ADK
 // Defining these functions as templates ensures that if these intrinsics are
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
+// This doesn't work with MSVC though, since the function names are macros.
 template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }

 template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+#endif

+#if EIGEN_COMP_MSVC_STRICT
+typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
+typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  double from[2] = {a, b};
+  return vld1q_f64(from);
+}
+
+#else
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;

+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return {a, b}; }
+#endif
+
+
 // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
 // for fast inversion of matrices of size 4.
@ -3656,7 +3686,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int m
 {
  const double* a = reinterpret_cast<const double*>(&m);
  const double* b = reinterpret_cast<const double*>(&n);
-  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
  return res;
 }

@ -3747,7 +3777,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
 template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
-  const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
+  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
  return padd(a, pxor(mask, b));
 }

@ -3862,7 +3892,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
 #endif

 // min
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@ -15,6 +15,113 @@ namespace Eigen {

 namespace internal {

+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
+  return Packet2f(vreinterpret_f32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
+  return Packet2f(vreinterpret_f32_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return Packet4f(vreinterpretq_f32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return Packet4f(vreinterpretq_f32_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
+  return Packet8c(preinterpret<Packet8c>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
+  return Packet16c(vreinterpretq_s8_u8(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
+  return Packet8uc(vreinterpret_u8_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return Packet16uc(vreinterpretq_u8_s8(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
+  return Packet4s(vreinterpret_s16_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return Packet8s(vreinterpretq_s16_u16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
+  return Packet4us(vreinterpret_u16_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return Packet8us(vreinterpretq_u16_s16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
+  return Packet2i(vreinterpret_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
+  return Packet2i(vreinterpret_s32_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return Packet4i(vreinterpretq_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return Packet4i(vreinterpretq_s32_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
+  return Packet2ui(vreinterpret_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
+  return Packet2ui(vreinterpret_u32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return Packet4ui(vreinterpretq_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return Packet4ui(vreinterpretq_u32_s32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
+  return Packet2l(vreinterpretq_s64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return Packet2ul(vreinterpretq_u64_s64(a));
+}
+
 //==============================================================================
 // pcast, SrcType = float
 //==============================================================================
@ -188,7 +295,7 @@ struct type_casting_traits<numext::int8_t, numext::uint64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
-  return vreinterpretq_u64_s64(pcast<Packet16c, Packet2l>(a));
+  return preinterpret<Packet2ul>(pcast<Packet16c, Packet2l>(a));
 }

 template <>
@ -212,11 +319,11 @@ struct type_casting_traits<numext::int8_t, numext::uint32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
-  return vreinterpretq_u32_s32(pcast<Packet16c, Packet4i>(a));
+  return preinterpret<Packet4ui>(pcast<Packet16c, Packet4i>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
-  return vreinterpret_u32_s32(pcast<Packet8c, Packet2i>(a));
+  return preinterpret<Packet2ui>(pcast<Packet8c, Packet2i>(a));
 }

 template <>
@ -240,11 +347,11 @@ struct type_casting_traits<numext::int8_t, numext::uint16_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
-  return vreinterpretq_u16_s16(pcast<Packet16c, Packet8s>(a));
+  return preinterpret<Packet8us>(pcast<Packet16c, Packet8s>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
-  return vreinterpret_u16_s16(pcast<Packet8c, Packet4s>(a));
+  return preinterpret<Packet4us>(pcast<Packet8c, Packet4s>(a));
 }

 template <>
@ -270,11 +377,11 @@ struct type_casting_traits<numext::int8_t, numext::uint8_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c, Packet16uc>(const Packet16c& a) {
-  return vreinterpretq_u8_s8(a);
+  return preinterpret<Packet16uc>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c, Packet8uc>(const Packet8c& a) {
-  return vreinterpret_u8_s8(a);
+  return preinterpret<Packet8uc>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c, Packet4uc>(const Packet4c& a) {
@ -315,7 +422,7 @@ struct type_casting_traits<numext::uint8_t, numext::int64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
-  return vreinterpretq_s64_u64(pcast<Packet16uc, Packet2ul>(a));
+  return preinterpret<Packet2l>(pcast<Packet16uc, Packet2ul>(a));
 }

 template <>
@ -339,11 +446,11 @@ struct type_casting_traits<numext::uint8_t, numext::int32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
-  return vreinterpretq_s32_u32(pcast<Packet16uc, Packet4ui>(a));
+  return preinterpret<Packet4i>(pcast<Packet16uc, Packet4ui>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
-  return vreinterpret_s32_u32(pcast<Packet8uc, Packet2ui>(a));
+  return preinterpret<Packet2i>(pcast<Packet8uc, Packet2ui>(a));
 }

 template <>
@ -367,11 +474,11 @@ struct type_casting_traits<numext::uint8_t, numext::int16_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
-  return vreinterpretq_s16_u16(pcast<Packet16uc, Packet8us>(a));
+  return preinterpret<Packet8s>(pcast<Packet16uc, Packet8us>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pcast<Packet8uc, Packet4s>(const Packet8uc& a) {
-  return vreinterpret_s16_u16(pcast<Packet8uc, Packet4us>(a));
+  return preinterpret<Packet4s>(pcast<Packet8uc, Packet4us>(a));
 }

 template <>
@ -397,11 +504,11 @@ struct type_casting_traits<numext::uint8_t, numext::int8_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc, Packet16c>(const Packet16uc& a) {
-  return vreinterpretq_s8_u8(a);
+  return preinterpret<Packet16c>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc, Packet8c>(const Packet8uc& a) {
-  return vreinterpret_s8_u8(a);
+  return preinterpret<Packet8c>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc, Packet4c>(const Packet4uc& a) {
@ -442,7 +549,7 @@ struct type_casting_traits<numext::int16_t, numext::uint64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
-  return vreinterpretq_u64_s64(pcast<Packet8s, Packet2l>(a));
+  return preinterpret<Packet2ul>(pcast<Packet8s, Packet2l>(a));
 }

 template <>
@ -466,11 +573,11 @@ struct type_casting_traits<numext::int16_t, numext::uint32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
-  return vreinterpretq_u32_s32(pcast<Packet8s, Packet4i>(a));
+  return preinterpret<Packet4ui>(pcast<Packet8s, Packet4i>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
-  return vreinterpret_u32_s32(pcast<Packet4s, Packet2i>(a));
+  return preinterpret<Packet2ui>(pcast<Packet4s, Packet2i>(a));
 }

 template <>
@ -492,11 +599,11 @@ struct type_casting_traits<numext::int16_t, numext::uint16_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet8s, Packet8us>(const Packet8s& a) {
-  return vreinterpretq_u16_s16(a);
+  return preinterpret<Packet8us>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4us pcast<Packet4s, Packet4us>(const Packet4s& a) {
-  return vreinterpret_u16_s16(a);
+  return preinterpret<Packet4us>(a);
 }

 template <>
@ -559,7 +666,7 @@ struct type_casting_traits<numext::uint16_t, numext::int64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
-  return vreinterpretq_s64_u64(pcast<Packet8us, Packet2ul>(a));
+  return preinterpret<Packet2l>(pcast<Packet8us, Packet2ul>(a));
 }

 template <>
@ -583,11 +690,11 @@ struct type_casting_traits<numext::uint16_t, numext::int32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
-  return vreinterpretq_s32_u32(pcast<Packet8us, Packet4ui>(a));
+  return preinterpret<Packet4i>(pcast<Packet8us, Packet4ui>(a));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
-  return vreinterpret_s32_u32(pcast<Packet4us, Packet2ui>(a));
+  return preinterpret<Packet2i>(pcast<Packet4us, Packet2ui>(a));
 }

 template <>
@ -609,11 +716,11 @@ struct type_casting_traits<numext::uint16_t, numext::int16_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet8us, Packet8s>(const Packet8us& a) {
-  return vreinterpretq_s16_u16(a);
+  return preinterpret<Packet8s>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pcast<Packet4us, Packet4s>(const Packet4us& a) {
-  return vreinterpret_s16_u16(a);
+  return preinterpret<Packet4s>(a);
 }

 template <>
@ -635,11 +742,11 @@ struct type_casting_traits<numext::uint16_t, numext::int8_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
-  return vreinterpretq_s8_u8(pcast<Packet8us, Packet16uc>(a, b));
+  return preinterpret<Packet16c>(pcast<Packet8us, Packet16uc>(a, b));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
-  return vreinterpret_s8_u8(pcast<Packet4us, Packet8uc>(a, b));
+  return preinterpret<Packet8c>(pcast<Packet4us, Packet8uc>(a, b));
 }

 //==============================================================================
@ -674,7 +781,7 @@ struct type_casting_traits<numext::int32_t, numext::uint64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
-  return vreinterpretq_u64_s64(pcast<Packet4i, Packet2l>(a));
+  return preinterpret<Packet2ul>(pcast<Packet4i, Packet2l>(a));
 }

 template <>
@ -696,11 +803,11 @@ struct type_casting_traits<numext::int32_t, numext::uint32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i, Packet4ui>(const Packet4i& a) {
-  return vreinterpretq_u32_s32(a);
+  return preinterpret<Packet4ui>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i, Packet2ui>(const Packet2i& a) {
-  return vreinterpret_u32_s32(a);
+  return preinterpret<Packet2ui>(a);
 }

 template <>
@ -799,7 +906,7 @@ struct type_casting_traits<numext::uint32_t, numext::int64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
-  return vreinterpretq_s64_u64(pcast<Packet4ui, Packet2ul>(a));
+  return preinterpret<Packet2l>(pcast<Packet4ui, Packet2ul>(a));
 }

 template <>
@ -821,11 +928,11 @@ struct type_casting_traits<numext::uint32_t, numext::int32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui, Packet4i>(const Packet4ui& a) {
-  return vreinterpretq_s32_u32(a);
+  return preinterpret<Packet4i>(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui, Packet2i>(const Packet2ui& a) {
-  return vreinterpret_s32_u32(a);
+  return preinterpret<Packet2i>(a);
 }

 template <>
@ -847,11 +954,11 @@ struct type_casting_traits<numext::uint32_t, numext::int16_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
-  return vreinterpretq_s16_u16(pcast<Packet4ui, Packet8us>(a, b));
+  return preinterpret<Packet8s>(pcast<Packet4ui, Packet8us>(a, b));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
-  return vreinterpret_s16_u16(pcast<Packet2ui, Packet4us>(a, b));
+  return preinterpret<Packet4s>(pcast<Packet2ui, Packet4us>(a, b));
 }

 template <>
@ -880,12 +987,12 @@ struct type_casting_traits<numext::uint32_t, numext::int8_t> {
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
                                                          const Packet4ui& d) {
-  return vreinterpretq_s8_u8(pcast<Packet4ui, Packet16uc>(a, b, c, d));
+  return preinterpret<Packet16c>(pcast<Packet4ui, Packet16uc>(a, b, c, d));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
                                                        const Packet2ui& d) {
-  return vreinterpret_s8_u8(pcast<Packet2ui, Packet8uc>(a, b, c, d));
+  return preinterpret<Packet8c>(pcast<Packet2ui, Packet8uc>(a, b, c, d));
 }

 //==============================================================================
@ -915,7 +1022,7 @@ struct type_casting_traits<numext::int64_t, numext::uint64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l, Packet2ul>(const Packet2l& a) {
-  return vreinterpretq_u64_s64(a);
+  return preinterpret<Packet2ul>(a);
 }

 template <>
@ -1013,7 +1120,7 @@ struct type_casting_traits<numext::uint64_t, numext::int64_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul, Packet2l>(const Packet2ul& a) {
-  return vreinterpretq_s64_u64(a);
+  return preinterpret<Packet2l>(a);
 }

 template <>
@ -1031,7 +1138,7 @@ struct type_casting_traits<numext::uint64_t, numext::int32_t> {
 };
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
-  return vreinterpretq_s32_u32(pcast<Packet2ul, Packet4ui>(a, b));
+  return preinterpret<Packet4i>(pcast<Packet2ul, Packet4ui>(a, b));
 }

 template <>
@ -1053,7 +1160,7 @@ struct type_casting_traits<numext::uint64_t, numext::int16_t> {
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                        const Packet2ul& d) {
-  return vreinterpretq_s16_u16(pcast<Packet2ul, Packet8us>(a, b, c, d));
+  return preinterpret<Packet8s>(pcast<Packet2ul, Packet8us>(a, b, c, d));
 }

 template <>
@ -1077,114 +1184,7 @@ template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
                                                          const Packet2ul& g, const Packet2ul& h) {
-  return vreinterpretq_s8_u8(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
-}
-
-//==============================================================================
-// preinterpret
-//==============================================================================
-template <>
-EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
-  return vreinterpret_f32_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
-  return vreinterpret_f32_u32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_f32_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
-  return vreinterpretq_f32_u32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
-  return static_cast<Packet4c>(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
-  return vreinterpret_s8_u8(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
-  return vreinterpretq_s8_u8(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
-  return static_cast<Packet4uc>(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
-  return vreinterpret_u8_s8(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
-  return vreinterpretq_u8_s8(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
-  return vreinterpret_s16_u16(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
-  return vreinterpretq_s16_u16(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
-  return vreinterpret_u16_s16(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
-  return vreinterpretq_u16_s16(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
-  return vreinterpret_s32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
-  return vreinterpret_s32_u32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
-  return vreinterpretq_s32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
-  return vreinterpretq_s32_u32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
-  return vreinterpret_u32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
-  return vreinterpret_u32_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
-  return vreinterpretq_u32_f32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_u32_s32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
-  return vreinterpretq_s64_u64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
-  return vreinterpretq_u64_s64(a);
+  return preinterpret<Packet16c>(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
 }

 #if EIGEN_ARCH_ARM64
@ -1193,6 +1193,31 @@ EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l&
 // pcast/preinterpret, Double
 //==============================================================================

+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return Packet2d(vreinterpretq_f64_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return Packet2d(vreinterpretq_f64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return Packet2l(vreinterpretq_s64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return Packet2ul(vreinterpretq_u64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return Packet2d(vreinterpretq_f64_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return Packet4i(vreinterpretq_s32_f64(a));
+}
+
 template <>
 struct type_casting_traits<double, double> {
  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
@ -1314,7 +1339,9 @@ struct type_casting_traits<numext::int8_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
  // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet8c, Packet2f>(vget_low_s8(a)));
+  // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability.
+  Packet2f tmp = pcast<Packet8c, Packet2f>(vget_low_s8(a));
+  return vcvt_f64_f32(tmp);
 }

 template <>
@ -1324,7 +1351,8 @@ struct type_casting_traits<numext::uint8_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
  // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet8uc, Packet2f>(vget_low_u8(a)));
+  Packet2f tmp = pcast<Packet8uc, Packet2f>(vget_low_u8(a));
+  return vcvt_f64_f32(tmp);
 }

 template <>
@ -1334,7 +1362,8 @@ struct type_casting_traits<numext::int16_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
  // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet4s, Packet2f>(vget_low_s16(a)));
+  Packet2f tmp = pcast<Packet4s, Packet2f>(vget_low_s16(a));
+  return vcvt_f64_f32(tmp);
 }

 template <>
@ -1344,7 +1373,8 @@ struct type_casting_traits<numext::uint16_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
  // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet4us, Packet2f>(vget_low_u16(a)));
+  Packet2f tmp = pcast<Packet4us, Packet2f>(vget_low_s16(a));
+  return vcvt_f64_f32(tmp);
 }

 template <>
@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
  return vcvtq_f64_u64(a);
 }

-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
-  return vreinterpretq_f64_s64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
-  return vreinterpretq_f64_u64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_s64_f64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_u64_f64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_f64_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_s32_f64(a);
-}
-
 #endif  // EIGEN_ARCH_ARM64

 }  // end namespace internal