From 2b50ade6ca9391feb79f718d270d6cf5e4cd27fb Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 19 Aug 2013 16:02:27 +0200
Subject: [PATCH] Fix bug #642: add vectorization of sqrt for doubles, and make
 sqrt really safe if EIGEN_FAST_MATH is disabled (grafted from
 d4dd6aaed2c70b5e32541e96b4864b90dc07c614  and
 c47010e3d221396cde2aad6b3250fa698a930e28 )

---
 Eigen/src/Core/arch/SSE/MathFunctions.h | 11 +++++++++++
 Eigen/src/Core/arch/SSE/PacketMath.h    |  3 ++-
 doc/PreprocessorDirectives.dox          |  6 +++---
 3 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 3376a984e..99cbd0d95 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -442,8 +442,11 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
   return _mm_xor_ps(y, sign_bit);
 }
 
+#if EIGEN_FAST_MATH
+
 // This is based on Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
+// It lacks 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
@@ -457,6 +460,14 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
   return pmul(_x,x);
 }
 
+#else
+
+template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } // Used when EIGEN_FAST_MATH is 0: delegates directly to the SSE _mm_sqrt_ps intrinsic instead of the approximation above.
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } // Defined outside the EIGEN_FAST_MATH conditional, so it applies in both modes; delegates to the SSE2 _mm_sqrt_pd intrinsic.
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index e256f4bac..f85d2e06e 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -83,7 +83,8 @@ template<> struct packet_traits<double> : default_packet_traits
     size=2,
 
     HasDiv  = 1,
-    HasExp  = 1
+    HasExp  = 1,
+    HasSqrt = 1
   };
 };
 template<> struct packet_traits<int>    : default_packet_traits
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index eedd5524a..981083e96 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -64,9 +64,9 @@ run time. However, these assertions do cost time and can thus be turned off.
    \c EIGEN_DONT_ALIGN is defined.
  - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
    alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
- - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. The only
-   optimization this currently includes is single precision sin() and cos() in the present of SSE
-   vectorization. Defined by default. 
+ - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
+   enables the SSE vectorization of sin() and cos(), and speeds up sqrt() for single precision. Defined to 1 by default.
+   Define it to 0 to disable.
  - \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
    unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
    correspond to the number of iterations or the number of instructions. The default is value 100.