Fix bug #642: add vectorization of sqrt for doubles, and make sqrt really safe if EIGEN_FAST_MATH is disabled

(grafted from d4dd6aaed2c70b5e32541e96b4864b90dc07c614 and c47010e3d221396cde2aad6b3250fa698a930e28 )
2025-07-09 14:41:49 +08:00 · 2013-08-19 16:02:27 +02:00 · 2013-08-19 16:02:27 +02:00 · 2b50ade6ca
commit 2b50ade6ca
parent f9149f9ba0
3 changed files with 16 additions and 4 deletions
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -442,8 +442,11 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
  return _mm_xor_ps(y, sign_bit);
 }
 #if EIGEN_FAST_MATH
 // This is based on Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 // It lacks 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
@ -457,6 +460,14 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
  return pmul(_x,x);
 }
 #else
 // Fallback selected when EIGEN_FAST_MATH evaluates to false: forwards directly to the
 // SSE _mm_sqrt_ps intrinsic instead of the approximate implementation in the #if branch above.
 template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
 #endif
 // Packet square root for Packet2d: forwards to the SSE2 _mm_sqrt_pd intrinsic.
 // Defined after the #endif above, i.e. outside the EIGEN_FAST_MATH conditional.
 template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -83,7 +83,8 @@ template<> struct packet_traits<double> : default_packet_traits
    size=2,
    HasDiv  = 1,
-    HasExp  = 1
+    HasExp  = 1,
    HasSqrt = 1
  };
 };
 template<> struct packet_traits<int>    : default_packet_traits
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@ -64,9 +64,9 @@ run time. However, these assertions do cost time and can thus be turned off.
   \c EIGEN_DONT_ALIGN is defined.
 - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
   alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
- - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. The only
+ - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
-   optimization this currently includes is single precision sin() and cos() in the present of SSE
+   enables the SSE vectorization of sin() and cos(), and speeds up sqrt() for single precision. Defined to 1 by default.
-   vectorization. Defined by default. 
+   Define it to 0 to disable.
 - \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
   unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
   correspond to the number of iterations or the number of instructions. The default is value 100.