From 5e144bbaa454eb2af7f6751376051fe16d143276 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Tue, 24 Jan 2017 13:32:50 -0800
Subject: [PATCH] Make NaN propagation consistent between pmax/pmin and
 std::max/std::min.

This makes the NaN propagation consistent between the scalar and vectorized
code paths of Eigen's scalar_max_op and scalar_min_op. See #1373 for details.
---
 Eigen/src/Core/MathFunctionsImpl.h     |  7 +--
 Eigen/src/Core/arch/AVX/PacketMath.h   | 25 +++++++---
 Eigen/src/Core/arch/SSE/PacketMath.h   | 67 ++++++++++++++++++++++----
 unsupported/test/cxx11_tensor_expr.cpp | 47 ++++++++++++++++++
 4 files changed, 126 insertions(+), 20 deletions(-)

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 3c9ef22fa..cdbd14e8c 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -29,12 +29,7 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  // NOTE GCC prior to 6.3 might improperly optimize this max/min
-  // step such that if a_x is nan, x will be either 9 or -9,
-  // and tanh will return 1 or -1 instead of nan.
-  // This is supposed to be fixed in gcc6.3,
-  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  const T x = pmax(minus_9,pmin(plus_9,a_x));
+  const T x = pmax(pmin(a_x, plus_9), minus_9);
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 195d40fb4..20d067c6a 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -183,12 +183,23 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d&
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
-
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_ps(b,a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_pd(b,a);
+}
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_ps(b,a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::max.
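+  // (vmaxpd returns its second operand if either operand is NaN, so a NaN in a is propagated here, exactly as with std::max.)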
+  return _mm256_max_pd(b,a);
+}
 template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
 template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
 
@@ -225,7 +236,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
 //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
 //   return _mm256_unpacklo_ps(tmp,tmp);
-  
+
   // _mm256_insertf128_ps is very slow on Haswell, thus:
   Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3832de147..03c8a2c13 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -45,7 +45,7 @@ struct eigen_packet_wrapper
     m_val = v;
     return *this;
   }
-  
+
   T m_val;
 };
 typedef eigen_packet_wrapper<__m128>  Packet4f;
@@ -69,7 +69,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
 
 #define vec2d_swizzle1(v,p,q) \
   (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
-  
+
 #define vec4f_swizzle2(a,b,p,q,r,s) \
   (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
 
@@ -190,7 +190,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
   return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
 }
 #endif
-  
+
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
@@ -250,8 +250,35 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet4f res = b;
+  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet2d res = b;
+  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
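+  // (minpd returns its second operand if either operand is NaN, so _mm_min_pd(b, a) yields a, exactly as std::min(a, b) does.)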
+  return _mm_min_pd(b, a);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -263,8 +290,34 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet4f res = b;
+  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet2d res = b;
+  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_pd(b, a);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 77e24cb67..129b4e659 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -300,6 +300,52 @@ static void test_select()
   }
 }
 
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+  for (int size = 1; size < 17; ++size) {
+    const Scalar kNan = std::numeric_limits<Scalar>::quiet_NaN();
+    Tensor<Scalar, 1> vec_nan(size);
+    Tensor<Scalar, 1> vec_zero(size);
+    Tensor<Scalar, 1> vec_res(size);
+    vec_nan.setConstant(kNan);
+    vec_zero.setZero();
+    vec_res.setZero();
+
+    // Test that we propagate NaNs in the tensor when applying the
+    // cwiseMax(scalar) operator, which is used for the Relu operator.
+    vec_res = vec_nan.cwiseMax(Scalar(0));
+    for (int i = 0; i < size; ++i) {
+      VERIFY((numext::isnan)(vec_res(i)));
+    }
+
+    // Test that NaNs do not propagate if we reverse the arguments.
+    vec_res = vec_zero.cwiseMax(kNan);
+    for (int i = 0; i < size; ++i) {
+      VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
+    }
+
+    // Test that we propagate NaNs in the tensor when applying the
+    // cwiseMin(scalar) operator.
+    vec_res.setZero();
+    vec_res = vec_nan.cwiseMin(Scalar(0));
+    for (int i = 0; i < size; ++i) {
+      VERIFY((numext::isnan)(vec_res(i)));
+    }
+
+    // Test that NaNs do not propagate if we reverse the arguments.
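+    // std::min(0, kNan) evaluates the comparison (kNan < 0), which is false, so 0 is selected.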
+    vec_res = vec_zero.cwiseMin(kNan);
+    for (int i = 0; i < size; ++i) {
+      VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
+    }
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}
 
 void test_cxx11_tensor_expr()
 {
@@ -311,4 +357,5 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_minmax_nan_propagation());
 }
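
A quick way to sanity-check the scalar convention that the vector paths above
are matched against is the small standalone program below. This is a minimal
sketch, not part of the patch, and the file name is hypothetical:
std::min/std::max return their first argument whenever a NaN is involved,
while the x86 minps/minpd/maxps/maxpd instructions return their second
operand, which is why the intrinsic arguments are swapped in the hunks above.

  // nan_propagation_check.cpp (hypothetical standalone sanity check).
  #include <algorithm>
  #include <cassert>
  #include <cmath>
  #include <limits>

  int main() {
    const float nan = std::numeric_limits<float>::quiet_NaN();
    // std::max(a, b) is (a < b) ? b : a; a NaN makes the comparison false,
    // so the first argument is returned. std::min behaves symmetrically.
    assert(std::isnan(std::max(nan, 0.f)));   // NaN in the first argument propagates.
    assert(!std::isnan(std::max(0.f, nan)));  // NaN in the second argument is dropped.
    assert(std::isnan(std::min(nan, 0.f)));
    assert(!std::isnan(std::min(0.f, nan)));
    return 0;
  }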