diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index f40af7f87..a58f13ca8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -129,12 +129,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) { - const Packet2ui b = vreinterpret_u32_f32(a.v); + const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v)); return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - const Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v)); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6996cc8d3..6c2dbe458 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -57,6 +57,16 @@ typedef eigen_packet_wrapper Packet4ui; typedef eigen_packet_wrapper Packet2l; typedef eigen_packet_wrapper Packet2ul; +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return vld1q_f32(from); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + float from[2] = {a, b}; + return vld1_f32(from); +} + #else typedef float32x2_t Packet2f; @@ -78,11 +88,14 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return {a, b, c, d}; } +EIGEN_ALWAYS_INLINE Packet4f make_packet2f(float a, float b) { return {a, b}; } + #endif // EIGEN_COMP_MSVC_STRICT EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))); return res; } @@ -95,7 +108,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -104,7 +117,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -146,7 +159,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC // __builtin_prefetch tends to do nothing on ARM64 compilers because the // prefetch instructions there are too detailed for __builtin_prefetch to map // meaningfully to them. @@ -862,12 +875,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, con template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { - Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; + Packet2f mask = make_packet2f(numext::bit_cast(0x80000000u), 0.0f); return padd(a, pxor(mask, b)); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { - Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; + Packet4f mask = make_packet4f(numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f); return padd(a, pxor(mask, b)); } @@ -2499,7 +2512,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(co template<> EIGEN_STRONG_INLINE float predux_mul(const Packet2f& a) { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } +{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) { int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); @@ -2513,7 +2526,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); } template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) -{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } +{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) { uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); @@ -2527,7 +2540,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); } template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) -{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } +{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) { const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); @@ -2563,11 +2576,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } +{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) -{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } +{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); } template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) @@ -3388,7 +3401,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) { // See the scalar implemention in BFloat16.h for a comprehensible explanation // of this fast rounding algorithm - Packet4ui input = reinterpret_cast(p); + Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p)); // lsb = (input >> 16) & 1 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1)); @@ -3413,7 +3426,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) { - return reinterpret_cast(vshlq_n_u32(vmovl_u16(p), 16)); + return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16))); } EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { @@ -3421,21 +3434,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { } template<> EIGEN_STRONG_INLINE Packet4bf pset1(const bfloat16& from) { - return pset1(from.value); + return Packet4bf(pset1(from.value)); } template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { - return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(from))); + return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(Packet4us(from)))); } template<> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { - return pload(reinterpret_cast(from)); + return Packet4bf(pload(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { - return ploadu(reinterpret_cast(from)); + return Packet4bf(ploadu(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) @@ -3450,7 +3463,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) { - return ploaddup(reinterpret_cast(from)); + return Packet4bf(ploaddup(reinterpret_cast(from))); } template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { @@ -3497,25 +3510,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) } template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { - return por(a, b); + return Packet4bf(por(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { - return pxor(a, b); + return Packet4bf(pxor(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { - return pand(a, b); + return Packet4bf(pand(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { - return pandnot(a, b); + return Packet4bf(pandnot(Packet4us(a), Packet4us(b))); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) { - return pselect(mask, a, b); + return Packet4bf(pselect(Packet4us(mask), Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) @@ -3554,13 +3567,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, con template<> EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) { - return pgather(reinterpret_cast(from), stride); + return Packet4bf(pgather(reinterpret_cast(from), stride)); } template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) { - pscatter(reinterpret_cast(to), from, stride); + pscatter(reinterpret_cast(to), Packet4us(from), stride); } template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) @@ -3585,7 +3598,7 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet4bf& a template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) { - return preverse(a); + return Packet4bf(preverse(Packet4us(a))); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) @@ -3620,7 +3633,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) { - return pxor(a, pset1(static_cast(0x8000))); + return Packet4bf(pxor(Packet4us(a), pset1(static_cast(0x8000)))); } //---------- double ---------- @@ -3638,17 +3651,34 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +#if EIGEN_COMP_GNUC // Bug 907: workaround missing declarations of the following two functions in the ADK // Defining these functions as templates ensures that if these intrinsics are // already defined in arm_neon.h, then our workaround doesn't cause a conflict // and has lower priority in overload resolution. +// This doesn't work with MSVC though, since the function names are macros. template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } +#endif +#if EIGEN_COMP_MSVC_STRICT +typedef eigen_packet_wrapper Packet2d; +typedef eigen_packet_wrapper Packet1d; + +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { + double from[2] = {a, b}; + return vld1q_f64(from); +} + +#else typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return {a, b}; } +#endif + + // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) // Currently used in LU/arch/InverseSize4.h to enable a shared implementation // for fast inversion of matrices of size 4. @@ -3656,7 +3686,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int m { const double* a = reinterpret_cast(&m); const double* b = reinterpret_cast(&n); - Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; + Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1))); return res; } @@ -3747,7 +3777,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ - const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; + const Packet2d mask = make_packet2d(numext::bit_cast(0x8000000000000000ull), 0.0); return padd(a, pxor(mask, b)); } @@ -3862,7 +3892,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } #else template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); } #endif // min diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 54f97336e..1bc51b0b1 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -15,6 +15,113 @@ namespace Eigen { namespace internal { +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return Packet2f(vreinterpret_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return Packet2f(vreinterpret_f32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return Packet4f(vreinterpretq_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return Packet4f(vreinterpretq_f32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return Packet8c(preinterpret(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return Packet16c(vreinterpretq_s8_u8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return Packet8uc(vreinterpret_u8_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return Packet16uc(vreinterpretq_u8_s8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return Packet4s(vreinterpret_s16_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return Packet8s(vreinterpretq_s16_u16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return Packet4us(vreinterpret_u16_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return Packet8us(vreinterpretq_u16_s16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return Packet2i(vreinterpret_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return Packet2i(vreinterpret_s32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return Packet4i(vreinterpretq_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return Packet4i(vreinterpretq_s32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return Packet2ui(vreinterpret_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return Packet2ui(vreinterpret_u32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return Packet4ui(vreinterpretq_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return Packet4ui(vreinterpretq_u32_s32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return Packet2l(vreinterpretq_s64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return Packet2ul(vreinterpretq_u64_s64(a)); +} + //============================================================================== // pcast, SrcType = float //============================================================================== @@ -188,7 +295,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -212,11 +319,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -240,11 +347,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { - return vreinterpretq_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { - return vreinterpret_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -270,11 +377,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { - return vreinterpretq_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { - return vreinterpret_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { @@ -315,7 +422,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -339,11 +446,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -367,11 +474,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { - return vreinterpretq_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { - return vreinterpret_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -397,11 +504,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { - return vreinterpret_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { @@ -442,7 +549,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -466,11 +573,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -492,11 +599,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { - return vreinterpretq_u16_s16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { - return vreinterpret_u16_s16(a); + return preinterpret(a); } template <> @@ -559,7 +666,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -583,11 +690,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -609,11 +716,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { - return vreinterpretq_s16_u16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { - return vreinterpret_s16_u16(a); + return preinterpret(a); } template <> @@ -635,11 +742,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { - return vreinterpretq_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { - return vreinterpret_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } //============================================================================== @@ -674,7 +781,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -696,11 +803,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { - return vreinterpretq_u32_s32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { - return vreinterpret_u32_s32(a); + return preinterpret(a); } template <> @@ -799,7 +906,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -821,11 +928,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { - return vreinterpret_s32_u32(a); + return preinterpret(a); } template <> @@ -847,11 +954,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { - return vreinterpretq_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { - return vreinterpret_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -880,12 +987,12 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, const Packet4ui& d) { - return vreinterpretq_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, const Packet2ui& d) { - return vreinterpret_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } //============================================================================== @@ -915,7 +1022,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(a); } template <> @@ -1013,7 +1120,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); + return preinterpret(a); } template <> @@ -1031,7 +1138,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { - return vreinterpretq_s32_u32(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -1053,7 +1160,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d) { - return vreinterpretq_s16_u16(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> @@ -1077,114 +1184,7 @@ template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, const Packet2ul& g, const Packet2ul& h) { - return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); -} - -//============================================================================== -// preinterpret -//============================================================================== -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { - return vreinterpret_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { - return vreinterpret_f32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return vreinterpretq_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { - return vreinterpretq_f32_u32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { - return vreinterpret_s8_u8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { - return vreinterpret_u8_s8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { - return vreinterpretq_u8_s8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { - return vreinterpret_s16_u16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { - return vreinterpretq_s16_u16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { - return vreinterpret_u16_s16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { - return vreinterpretq_u16_s16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { - return vreinterpret_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { - return vreinterpret_s32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return vreinterpretq_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { - return vreinterpret_u32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { - return vreinterpret_u32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { - return vreinterpretq_u32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { - return vreinterpretq_u32_s32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(pcast(a, b, c, d, e, f, g, h)); } #if EIGEN_ARCH_ARM64 @@ -1193,6 +1193,31 @@ EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& // pcast/preinterpret, Double //============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return Packet2d(vreinterpretq_f64_s64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return Packet2d(vreinterpretq_f64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return Packet2l(vreinterpretq_s64_f64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return Packet2ul(vreinterpretq_u64_f64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return Packet2d(vreinterpretq_f64_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return Packet4i(vreinterpretq_s32_f64(a)); +} + template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; @@ -1314,7 +1339,9 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s8(a))); + // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability. + Packet2f tmp = pcast(vget_low_s8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1324,7 +1351,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_u8(a))); + Packet2f tmp = pcast(vget_low_u8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1334,7 +1362,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1344,7 +1373,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_u16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { - return vreinterpretq_f64_s64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { - return vreinterpretq_f64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { - return vreinterpretq_s64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { - return vreinterpretq_u64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { - return vreinterpretq_f64_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { - return vreinterpretq_s32_f64(a); -} - #endif // EIGEN_ARCH_ARM64 } // end namespace internal