Fix arm32 float division and related bugs

(cherry picked from commit 81b48065ea673cd352d11ef9b6a3d86778ac962d) (cherry picked from commit 72f77ccb3ee2aeb3f0d7122dd1ab90d215206320)
2025-08-01 17:50:40 +08:00 · 2023-08-29 00:36:07 +00:00 · 2023-08-29 00:36:07 +00:00 · 5b20d9f326
commit 5b20d9f326
parent 5f8f69020b
1 changed files with 45 additions and 29 deletions
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -32,7 +32,7 @@ namespace internal {
 #if EIGEN_ARCH_ARM64
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #else
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 #endif
@ -107,7 +107,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
-   
+
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
@ -173,32 +173,48 @@ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
-{
+  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
 #if EIGEN_ARCH_ARM64
  return vdivq_f32(a,b);
 #else
  Packet4f inv, restep, div;
  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
  // Newton-Raphson and vrecpsq_f32()
  inv = vrecpeq_f32(b);
  // This returns a differential, by which we will have to multiply inv to get a better
  // approximation of 1/b.
  restep = vrecpsq_f32(b, inv);
  inv = vmulq_f32(restep, inv);
  // Finally, multiply a by 1/b and get the wanted result of the division.
  div = vmulq_f32(a, inv);
  return div;
 #endif
 }
 EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
  return vreinterpretq_f32_u32(vcleq_f32(a, b));
 }
 EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
 {
  // Compute approximate reciprocal.
  float32x4_t result = vrecpeq_f32(a);
  result = vmulq_f32(vrecpsq_f32(a, result), result);
  result = vmulq_f32(vrecpsq_f32(a, result), result);
  return result;
 }
 #if EIGEN_ARCH_ARM64
 template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
 #else
 template<typename Packet>
 EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
  // avoid underflow with the following manipulation:
  // a / b = f * (a * reciprocal(f * b))
  const Packet cst_one = pset1<Packet>(1.0f);
  const Packet cst_quarter = pset1<Packet>(0.25f);
  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
  return result;
 }
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pdiv_float_common(a, b);
 }
 #endif
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
@ -478,7 +494,7 @@ template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
  a_hi = vget_high_s32(a);
  min = vpmin_s32(a_lo, a_hi);
  min = vpmin_s32(min, min);
-  
+
  return vget_lane_s32(min, 0);
 }
@ -595,7 +611,7 @@ template<> struct packet_traits<double>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket=0,
-   
+
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
@ -751,7 +767,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[0] = trn1;
  kernel.packet[1] = trn2;
 }
-#endif // EIGEN_ARCH_ARM64 
+#endif // EIGEN_ARCH_ARM64
 } // end namespace internal