mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-13 12:19:12 +08:00
working 64-bit support in PacketMath.h, Complex.h needed
This commit is contained in:
parent
0f65f2762d
commit
b508619392
@ -20,10 +20,18 @@ namespace internal {
|
|||||||
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
|
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef EIGEN_HAS_FUSED_MADD
|
||||||
|
#define EIGEN_HAS_FUSED_MADD 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef EIGEN_HAS_FUSE_CJMADD
|
||||||
|
#define EIGEN_HAS_FUSE_CJMADD 1
|
||||||
|
#endif
|
||||||
|
|
||||||
// FIXME NEON has 16 quad registers, but since the current register allocator
|
// FIXME NEON has 16 quad registers, but since the current register allocator
|
||||||
// is so bad, it is much better to reduce it to 8
|
// is so bad, it is much better to reduce it to 8
|
||||||
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
||||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
|
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef float32x4_t Packet4f;
|
typedef float32x4_t Packet4f;
|
||||||
@ -71,6 +79,7 @@ template<> struct packet_traits<float> : default_packet_traits
|
|||||||
Vectorizable = 1,
|
Vectorizable = 1,
|
||||||
AlignedOnScalar = 1,
|
AlignedOnScalar = 1,
|
||||||
size = 4,
|
size = 4,
|
||||||
|
HasHalfPacket=0,
|
||||||
|
|
||||||
HasDiv = 1,
|
HasDiv = 1,
|
||||||
// FIXME check the Has*
|
// FIXME check the Has*
|
||||||
@ -136,6 +145,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||||
{
|
{
|
||||||
|
#ifndef __aarch64__
|
||||||
Packet4f inv, restep, div;
|
Packet4f inv, restep, div;
|
||||||
|
|
||||||
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
||||||
@ -154,7 +164,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
|
|||||||
div = vmulq_f32(a, inv);
|
div = vmulq_f32(a, inv);
|
||||||
|
|
||||||
return div;
|
return div;
|
||||||
|
#else
|
||||||
|
return vdivq_f32(a,b);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
|
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
|
||||||
{ eigen_assert(false && "packet integer division are not supported by NEON");
|
{ eigen_assert(false && "packet integer division are not supported by NEON");
|
||||||
return pset1<Packet4i>(0);
|
return pset1<Packet4i>(0);
|
||||||
@ -472,6 +486,164 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
|||||||
kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
|
kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//---------- double ----------
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
// Packet of two double-precision lanes, backed by the NEON float64x2_t vector type.
typedef float64x2_t Packet2d;
|
||||||
|
|
||||||
|
template<> struct packet_traits<double> : default_packet_traits
|
||||||
|
{
|
||||||
|
typedef Packet2d type;
|
||||||
|
typedef Packet2d half;
|
||||||
|
enum {
|
||||||
|
Vectorizable = 1,
|
||||||
|
AlignedOnScalar = 1,
|
||||||
|
size = 2,
|
||||||
|
HasHalfPacket=0,
|
||||||
|
|
||||||
|
HasDiv = 1,
|
||||||
|
// FIXME check the Has*
|
||||||
|
HasSin = 0,
|
||||||
|
HasCos = 0,
|
||||||
|
HasLog = 0,
|
||||||
|
HasExp = 0,
|
||||||
|
HasSqrt = 0
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reverse mapping from Packet2d to its scalar type: two doubles per packet, and the
// half-packet type is Packet2d itself.
template<> struct unpacket_traits<Packet2d>
{
  typedef double   type;
  typedef Packet2d half;
  enum { size = 2 };
};
|
||||||
|
|
||||||
|
// Broadcasts the scalar `from` into both lanes of a Packet2d.
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from)
{
  return vdupq_n_f64(from);
}
|
||||||
|
|
||||||
|
// Builds a packet from `a` broadcast to both lanes plus a per-lane offset packet.
// NOTE(review): presumably EIGEN_INIT_NEON_PACKET2(0, 1) yields lanes {0, 1}, making
// the result {a, a+1} -- confirm against the macro definition.
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
{
  Packet2d lane_offsets = EIGEN_INIT_NEON_PACKET2(0, 1);
  const Packet2d broadcast = pset1<Packet2d>(a);
  return vaddq_f64(broadcast, lane_offsets);
}
|
||||||
|
// Lane-wise sum a + b.
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vaddq_f64(a, b);
}
|
||||||
|
|
||||||
|
// Lane-wise difference a - b.
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vsubq_f64(a, b);
}
|
||||||
|
|
||||||
|
// Lane-wise negation -a.
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  return vnegq_f64(a);
}
|
||||||
|
|
||||||
|
// Conjugate of a packet of doubles: returns the input unchanged.
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a)
{
  return a;
}
|
||||||
|
|
||||||
|
// Lane-wise product a * b.
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vmulq_f64(a, b);
}
|
||||||
|
|
||||||
|
// Lane-wise quotient a / b, using the vector divide intrinsic directly.
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vdivq_f64(a, b);
}
|
||||||
|
|
||||||
|
// for some weird reasons, it has to be overloaded for packet of integers
|
||||||
|
// Multiply-accumulate: returns a*b + c for each lane.
// The fused intrinsic vfmaq_f64 (one rounding step) is used so that the behaviour
// matches EIGEN_HAS_FUSED_MADD being defined for this backend; vmlaq_f64 is not
// guaranteed to be fused.
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
{
  return vfmaq_f64(c, a, b);
}
|
||||||
|
|
||||||
|
// Lane-wise minimum of a and b.
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vminq_f64(a, b);
}
|
||||||
|
|
||||||
|
// Lane-wise maximum of a and b.
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vmaxq_f64(a, b);
}
|
||||||
|
|
||||||
|
// Logical operations are not supported on floating-point vectors, so we reinterpret the operands as integer vectors using NEON intrinsics
|
||||||
|
// Bitwise AND of two packets. The operands are viewed as 64-bit unsigned integer
// lanes for the operation and the result is viewed back as doubles.
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t lhs_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t rhs_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(vandq_u64(lhs_bits, rhs_bits));
}
|
||||||
|
|
||||||
|
// Bitwise OR of two packets. The operands are viewed as 64-bit unsigned integer
// lanes for the operation and the result is viewed back as doubles.
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t lhs_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t rhs_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(vorrq_u64(lhs_bits, rhs_bits));
}
|
||||||
|
|
||||||
|
// Bitwise XOR of two packets. The operands are viewed as 64-bit unsigned integer
// lanes for the operation and the result is viewed back as doubles.
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t lhs_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t rhs_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(veorq_u64(lhs_bits, rhs_bits));
}
|
||||||
|
|
||||||
|
// Bitwise a AND (NOT b), computed with the bit-clear intrinsic on the operands
// viewed as 64-bit unsigned integer lanes; the result is viewed back as doubles.
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t keep_bits  = vreinterpretq_u64_f64(a);
  const uint64x2_t clear_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(vbicq_u64(keep_bits, clear_bits));
}
|
||||||
|
|
||||||
|
// Aligned load of two consecutive doubles starting at `from`.
// EIGEN_DEBUG_ALIGNED_LOAD is expanded first (macro defined outside this section).
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  return vld1q_f64(from);
}
|
||||||
|
|
||||||
|
// Unaligned load of two consecutive doubles starting at `from`.
// EIGEN_DEBUG_UNALIGNED_LOAD is expanded first (macro defined outside this section).
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return vld1q_f64(from);
}
|
||||||
|
|
||||||
|
// Loads the single double at `from` and replicates it into both lanes.
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  const Packet2d duplicated = vld1q_dup_f64(from);
  return duplicated;
}
|
||||||
|
// Aligned store of both lanes of `from` to to[0] and to[1].
// EIGEN_DEBUG_ALIGNED_STORE is expanded first (macro defined outside this section).
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vst1q_f64(to, from);
}
|
||||||
|
|
||||||
|
// Unaligned store of both lanes of `from` to to[0] and to[1].
// EIGEN_DEBUG_UNALIGNED_STORE is expanded first (macro defined outside this section).
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  vst1q_f64(to, from);
}
|
||||||
|
|
||||||
|
// Strided gather: returns the packet {from[0], from[stride]}.
//   from   - base address of the strided sequence
//   stride - distance, in elements, between the two values to load
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
{
  // Seed both lanes with the first element so that the packet is never read while
  // uninitialized (vsetq_lane_f64 takes the whole input vector by value), then
  // overwrite lane 1 with the second element.
  Packet2d res = vdupq_n_f64(from[0*stride]);
  res = vsetq_lane_f64(from[1*stride], res, 1);
  return res;
}
|
||||||
|
// Strided scatter: writes lane 0 of `from` to to[0] and lane 1 to to[stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
{
  double* const first_slot = to + stride*0;
  *first_slot = vgetq_lane_f64(from, 0);

  double* const second_slot = to + stride*1;
  *second_slot = vgetq_lane_f64(from, 1);
}
|
||||||
|
// Forwards `addr` to the EIGEN_ARM_PREFETCH macro (defined outside this section).
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr)
{
  EIGEN_ARM_PREFETCH(addr);
}
|
||||||
|
|
||||||
|
// FIXME only store the 2 first elements ?
|
||||||
|
// Returns lane 0 of the packet as a scalar.
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a)
{
  return vgetq_lane_f64(a, 0);
}
|
||||||
|
|
||||||
|
// Swaps the two lanes: {a0, a1} becomes {a1, a0}.
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
  const float64x1_t upper = vget_high_f64(a);
  const float64x1_t lower = vget_low_f64(a);
  return vcombine_f64(upper, lower);
}
|
||||||
|
|
||||||
|
// Lane-wise absolute value.
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  return vabsq_f64(a);
}
|
||||||
|
|
||||||
|
// Horizontal sum: returns a0 + a1 as a scalar.
// Each lane is extracted as a double before adding. vget_low_f64/vget_high_f64
// return float64x1_t, which is a vector type on toolchains where it is not a plain
// typedef of double, so adding those and returning the result as double is not
// portable.
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  return vgetq_lane_f64(a, 0) + vgetq_lane_f64(a, 1);
}
|
||||||
|
|
||||||
|
// Reduces two packets at once: returns {vecs[0][0] + vecs[0][1], vecs[1][0] + vecs[1][1]}.
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  // Transpose the 2x2 block formed by the two inputs: one vector collects lane 0 of
  // each input, the other collects lane 1 of each input.
  const float64x2_t first_lanes  = vzip1q_f64(vecs[0], vecs[1]);
  const float64x2_t second_lanes = vzip2q_f64(vecs[0], vecs[1]);

  // Adding the transposed vectors yields the per-input sums, one per lane.
  return vaddq_f64(first_lanes, second_lanes);
}
|
||||||
|
// Other reduction functions:
|
||||||
|
// mul
|
||||||
|
// Horizontal product: returns a0 * a1 as a scalar.
// Each lane is extracted as a double before multiplying. vget_low_f64/vget_high_f64
// return float64x1_t, which is a vector type on toolchains where it is not a plain
// typedef of double, so multiplying those and returning the result as double is
// not portable.
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return vgetq_lane_f64(a, 0) * vgetq_lane_f64(a, 1);
}
|
||||||
|
|
||||||
|
// min
|
||||||
|
// Horizontal minimum: the pairwise-min of `a` with itself is taken and its lane 0
// is returned.
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  const float64x2_t pairwise_min = vpminq_f64(a, a);
  return vgetq_lane_f64(pairwise_min, 0);
}
|
||||||
|
|
||||||
|
// max
|
||||||
|
// Horizontal maximum: the pairwise-max of `a` with itself is taken and its lane 0
// is returned.
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  const float64x2_t pairwise_max = vpmaxq_f64(a, a);
  return vgetq_lane_f64(pairwise_max, 0);
}
|
||||||
|
|
||||||
|
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
|
||||||
|
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
|
||||||
|
#define PALIGN_NEON(Offset,Type,Command) \
|
||||||
|
template<>\
|
||||||
|
struct palign_impl<Offset,Type>\
|
||||||
|
{\
|
||||||
|
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
|
||||||
|
{\
|
||||||
|
if (Offset!=0)\
|
||||||
|
first = Command(first, second, Offset);\
|
||||||
|
}\
|
||||||
|
};\
|
||||||
|
|
||||||
|
PALIGN_NEON(0,Packet2d,vextq_f64)
|
||||||
|
PALIGN_NEON(1,Packet2d,vextq_f64)
|
||||||
|
#undef PALIGN_NEON
|
||||||
|
|
||||||
|
// In-place transpose of a 2x2 block of doubles held in kernel.packet[0..1]:
// packet[0] becomes {p0[0], p1[0]} and packet[1] becomes {p0[1], p1[1]}.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  // Both zips read the original packets, so they are computed before any write-back.
  const float64x2_t low_lanes  = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
  const float64x2_t high_lanes = vzip2q_f64(kernel.packet[0], kernel.packet[1]);

  kernel.packet[0] = low_lanes;
  kernel.packet[1] = high_lanes;
}
|
||||||
|
#endif // __aarch64__
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
Loading…
x
Reference in New Issue
Block a user