diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 42e7733d7..57de400e5 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -33,6 +33,7 @@ template<> struct packet_traits > : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -88,7 +89,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { @@ -252,7 +253,7 @@ template<> struct conj_helper template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec + // TODO optimize it for NEON Packet2cf res = conj_helper().pmul(a,b); Packet4f s, rev_s; @@ -265,11 +266,198 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; } +//---------- double ---------- +#ifdef __aarch64__ + +static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000); + +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } + +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d v1, v2; + + // Get the real values of a + v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); + // Get the real values of a + v2 = vdupq_lane_f64(vget_high_f64(a.v), 1); + // Multiply the real a with b + v1 = vmulq_f64(v1, b.v); + // Multiply the imag a with b + v2 = vmulq_f64(v2, b.v); + // Conjugate v2 + v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); + // Swap real/imag elements in v2. + v2 = preverse(v2); + // Add and return the result + return Packet1cd(vaddq_f64(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, DenseIndex stride) +{ + Packet2d res; + res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1); + return Packet1cd(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, DenseIndex stride) +{ + to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); +} + + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + std::complex EIGEN_ALIGN16 res; + pstore >(&res, a); + + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for NEON + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = pmul(b.v, b.v); + Packet2d rev_s = preverse(s); + + return Packet1cd(pdiv(res.v, padd(s,rev_s))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} +#endif // __aarch64__ } // end namespace internal diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e3a13d93a..472f7c0fe 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -34,8 +34,10 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif +typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; +typedef int32x2_t Packet2i; typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ @@ -74,12 +76,12 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits : default_packet_traits { typedef Packet4f type; - typedef Packet4f half; + typedef Packet2f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, - HasHalfPacket=0, + HasHalfPacket=1, HasDiv = 1, // FIXME check the Has* @@ -93,11 +95,12 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; - typedef Packet4i half; + typedef Packet2i half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + HasHalfPacket=1 // FIXME check the Has* }; }; @@ -490,16 +493,17 @@ ptranspose(PacketBlock& kernel) { #ifdef __aarch64__ typedef float64x2_t Packet2d; +typedef float64x1_t Packet1d; template<> struct packet_traits : default_packet_traits { typedef Packet2d type; - typedef Packet2d half; + typedef Packet1d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket=0, + HasHalfPacket=1, HasDiv = 1, // FIXME check the Has*