From 8dca9f97e38970b1f7fed6cb508c58d8ff39d526 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 21 Jan 2016 20:18:51 +0100
Subject: [PATCH 1/3] Add numext::sqrt function to enable custom optimized
 implementation.

This changeset add two specializations for float/double on SSE. Those
are mostly usefull with GCC for which std::sqrt add an extra and costly
check on the result of _mm_sqrt_*. Clang does not add this burden.

In this changeset, only DenseBase::norm() makes use of it.
---
 Eigen/src/Core/Dot.h                    |  3 +--
 Eigen/src/Core/MathFunctions.h          | 20 ++++++++++++++++++--
 Eigen/src/Core/arch/SSE/MathFunctions.h | 22 ++++++++++++++++++++++
 3 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index c5040c67b..ce42854cd 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -99,8 +99,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
 template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
-  EIGEN_USING_STD_MATH(sqrt)
-  return sqrt(squaredNorm());
+  return numext::sqrt(squaredNorm());
 }
 
 /** \returns an expression of the quotient of *this by its own norm.
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 4d5e1acb8..1c7b28a4b 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -954,8 +954,8 @@ T (ceil)(const T& x)
   return ceil(x);
 }
 
-// Log base 2 for 32 bits positive integers.
-// Conveniently returns 0 for x==0.
+/** Log base 2 for 32 bits positive integers.
+  * Conveniently returns 0 for x==0. */
 inline int log2(int x)
 {
   eigen_assert(x>=0);
@@ -969,6 +969,22 @@ inline int log2(int x)
   return table[(v * 0x07C4ACDDU) >> 27];
 }
 
+/** \returns the square root of \a x.
+  *
+  * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode,
+  * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
+  * specializations when SSE is enabled.
+  *
+  * It's usage is justified in performance critical functions, like norm/normalize.
+  */
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sqrt(const T &x)
+{
+  EIGEN_USING_STD_MATH(sqrt);
+  return sqrt(x);
+}
+
 } // end namespace numext
 
 namespace internal {
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 3b8b7303f..0dd52f96e 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -518,6 +518,28 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
 
 } // end namespace internal
 
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sqrt(const float &x)
+{
+  return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x)));
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sqrt(const double &x)
+{
+#if EIGEN_COMP_GNUC
+  return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x)));
+#else
+  return internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x)));
+#endif
+}
+
+} // end namespace numex
+
 } // end namespace Eigen
 
 #endif // EIGEN_MATH_FUNCTIONS_SSE_H

From 7cae8918c019feabf6c143c430d0cd82c74aeec3 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 21 Jan 2016 20:30:32 +0100
Subject: [PATCH 2/3] Fix compilation on old gcc+AVX

---
 Eigen/src/Core/arch/SSE/MathFunctions.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 0dd52f96e..74f6abc37 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -524,7 +524,7 @@ template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sqrt(const float &x)
 {
-  return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x)));
+  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
 }
 
 template<>
@@ -532,9 +532,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double sqrt(const double &x)
 {
 #if EIGEN_COMP_GNUC
-  return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x)));
+  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
 #else
-  return internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x)));
+  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
 #endif
 }
 

From ee37eb4eed09fe35be2acc3699e80f49a44ea99a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 21 Jan 2016 20:43:42 +0100
Subject: [PATCH 3/3] bug #977: avoid division by 0 in normalize() and
 normalized().

---
 Eigen/src/Core/Dot.h | 19 ++++++++++++++++---
 test/adjoint.cpp     |  9 +++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index ce42854cd..221fc3224 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -102,7 +102,10 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
   return numext::sqrt(squaredNorm());
 }
 
-/** \returns an expression of the quotient of *this by its own norm.
+/** \returns an expression of the quotient of \c *this by its own norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
   *
   * \only_for_vectors
   *
@@ -114,19 +117,29 @@ MatrixBase<Derived>::normalized() const
 {
   typedef typename internal::nested_eval<Derived,2>::type _Nested;
   _Nested n(derived());
-  return n / n.norm();
+  RealScalar z = n.squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    return n / numext::sqrt(z);
+  else
+    return n;
 }
 
 /** Normalizes the vector, i.e. divides it by its own norm.
   *
   * \only_for_vectors
   *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
   * \sa norm(), normalized()
   */
 template<typename Derived>
 inline void MatrixBase<Derived>::normalize()
 {
-  *this /= norm();
+  RealScalar z = squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z);
 }
 
 //---------- implementation of other norms ----------
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index 3b2a53c91..b1e69c2e5 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -42,6 +42,15 @@ template<> struct adjoint_specific<false> {
     VERIFY_IS_APPROX(v1, v1.norm() * v3);
     VERIFY_IS_APPROX(v3, v1.normalized());
     VERIFY_IS_APPROX(v3.norm(), RealScalar(1));
+
+    // check null inputs
+    VERIFY_IS_APPROX((v1*0).normalized(), (v1*0));
+    RealScalar very_small = (std::numeric_limits<RealScalar>::min)();
+    VERIFY( (v1*very_small).norm() == 0 );
+    VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small));
+    v3 = v1*very_small;
+    v3.normalize();
+    VERIFY_IS_APPROX(v3, (v1*very_small));
     
     // check compatibility of dot and adjoint
     ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm()));