From 2b50ade6ca9391feb79f718d270d6cf5e4cd27fb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 19 Aug 2013 16:02:27 +0200 Subject: [PATCH] Fix bug #642: add vectorization of sqrt for doubles, and make sqrt really safe if EIGEN_FAST_MATH is disabled (grafted from d4dd6aaed2c70b5e32541e96b4864b90dc07c614 and c47010e3d221396cde2aad6b3250fa698a930e28 ) --- Eigen/src/Core/arch/SSE/MathFunctions.h | 11 +++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 3 ++- doc/PreprocessorDirectives.dox | 6 +++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 3376a984e..99cbd0d95 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -442,8 +442,11 @@ Packet4f pcos(const Packet4f& _x) return _mm_xor_ps(y, sign_bit); } +#if EIGEN_FAST_MATH + // This is based on Quake3's fast inverse square root. // For detail see here: http://www.beyond3d.com/content/articles/8/ +// It lacks 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly. 
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& _x) { @@ -457,6 +460,14 @@ Packet4f psqrt(const Packet4f& _x) return pmul(_x,x); } +#else + +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } + +#endif + +template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index e256f4bac..f85d2e06e 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -83,7 +83,8 @@ template<> struct packet_traits : default_packet_traits size=2, HasDiv = 1, - HasExp = 1 + HasExp = 1, + HasSqrt = 1 }; }; template<> struct packet_traits : default_packet_traits diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index eedd5524a..981083e96 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -64,9 +64,9 @@ run time. However, these assertions do cost time and can thus be turned off. \c EIGEN_DONT_ALIGN is defined. - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN. - - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. The only - optimization this currently includes is single precision sin() and cos() in the present of SSE - vectorization. Defined by default. + - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently + enables the SSE vectorization of sin() and cos(), and speeds up sqrt() for single precision. Defined to 1 by default. + Define it to 0 to disable. - \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable unrolling. 
The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not correspond to the number of iterations or the number of instructions. The default is value 100.