From 5e144bbaa454eb2af7f6751376051fe16d143276 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Tue, 24 Jan 2017 13:32:50 -0800
Subject: [PATCH] Make NaN propagation consistent between pmax/pmin and
 std::max/std::min.

This makes the NaN propagation consistent between the scalar and vectorized
code paths of Eigen's scalar_max_op and scalar_min_op. See #1373 for details.
---
 Eigen/src/Core/MathFunctionsImpl.h     |  7 +--
 Eigen/src/Core/arch/AVX/PacketMath.h   | 25 +++++++---
 Eigen/src/Core/arch/SSE/PacketMath.h   | 67 ++++++++++++++++++++++----
 unsupported/test/cxx11_tensor_expr.cpp | 47 ++++++++++++++++++
 4 files changed, 126 insertions(+), 20 deletions(-)

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 3c9ef22fa..cdbd14e8c 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -29,12 +29,7 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  // NOTE GCC prior to 6.3 might improperly optimize this max/min
-  // step such that if a_x is nan, x will be either 9 or -9,
-  // and tanh will return 1 or -1 instead of nan.
-  // This is supposed to be fixed in gcc6.3,
-  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  const T x = pmax(minus_9,pmin(plus_9,a_x));
+  const T x = pmax(pmin(a_x, plus_9), minus_9);
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 195d40fb4..20d067c6a 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -183,12 +183,23 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d&
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
-
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_ps(b,a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_pd(b,a);
+}
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_ps(b,a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  // Arguments are swapped to match NaN propagation behavior of std::max.
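+  // (vmaxpd returns its second operand if either operand is NaN, so a NaN in a is propagated here, exactly as with std::max.)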
+  return _mm256_max_pd(b,a);
+}
 template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
 template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
 
@@ -225,7 +236,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
 //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
 //   return _mm256_unpacklo_ps(tmp,tmp);
-  
+
   // _mm256_insertf128_ps is very slow on Haswell, thus:
   Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3832de147..03c8a2c13 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -45,7 +45,7 @@ struct eigen_packet_wrapper
     m_val = v;
     return *this;
   }
-  
+
   T m_val;
 };
 typedef eigen_packet_wrapper<__m128>  Packet4f;
@@ -69,7 +69,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
 
 #define vec2d_swizzle1(v,p,q) \
   (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
-  
+
 #define vec4f_swizzle2(a,b,p,q,r,s) \
   (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
 
@@ -190,7 +190,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
   return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
 }
 #endif
-  
+
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
@@ -250,8 +250,35 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet4f res = b;
+  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet2d res = b;
+  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
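+  // (minpd returns its second operand if either operand is NaN, so _mm_min_pd(b, a) yields a, exactly as std::min(a, b) does.)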
+  return _mm_min_pd(b, a);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -263,8 +290,34 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet4f res = b;
+  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet2d res = b;
+  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_pd(b, a);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 77e24cb67..129b4e659 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -300,6 +300,52 @@ static void test_select()
   }
 }
 
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+  for (int size = 1; size < 17; ++size) {
+    const Scalar kNan = std::numeric_limits<Scalar>::quiet_NaN();
+    Tensor<Scalar, 1> vec_nan(size);
+    Tensor<Scalar, 1> vec_zero(size);
+    Tensor<Scalar, 1> vec_res(size);
+    vec_nan.setConstant(kNan);
+    vec_zero.setZero();
+    vec_res.setZero();
+
+    // Test that we propagate NaNs in the tensor when applying the
+    // cwiseMax(scalar) operator, which is used for the Relu operator.
+    vec_res = vec_nan.cwiseMax(Scalar(0));
+    for (int i = 0; i < size; ++i) {
+      VERIFY((numext::isnan)(vec_res(i)));
+    }
+
+    // Test that NaNs do not propagate if we reverse the arguments.
+    vec_res = vec_zero.cwiseMax(kNan);
+    for (int i = 0; i < size; ++i) {
+      VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
+    }
+
+    // Test that we propagate NaNs in the tensor when applying the
+    // cwiseMin(scalar) operator.
+    vec_res.setZero();
+    vec_res = vec_nan.cwiseMin(Scalar(0));
+    for (int i = 0; i < size; ++i) {
+      VERIFY((numext::isnan)(vec_res(i)));
+    }
+
+    // Test that NaNs do not propagate if we reverse the arguments.
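+    // std::min(0, kNan) evaluates the comparison (kNan < 0), which is false, so 0 is selected.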
+    vec_res = vec_zero.cwiseMin(kNan);
+    for (int i = 0; i < size; ++i) {
+      VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
+    }
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}
 
 void test_cxx11_tensor_expr()
 {
@@ -311,4 +357,5 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_minmax_nan_propagation());
 }
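
A quick way to sanity-check the scalar convention that the vector paths above
are matched against is the small standalone program below. This is a minimal
sketch, not part of the patch, and the file name is hypothetical:
std::min/std::max return their first argument whenever a NaN is involved,
while the x86 minps/minpd/maxps/maxpd instructions return their second
operand, which is why the intrinsic arguments are swapped in the hunks above.

  // nan_propagation_check.cpp (hypothetical standalone sanity check).
  #include <algorithm>
  #include <cassert>
  #include <cmath>
  #include <limits>

  int main() {
    const float nan = std::numeric_limits<float>::quiet_NaN();
    // std::max(a, b) is (a < b) ? b : a; a NaN makes the comparison false,
    // so the first argument is returned. std::min behaves symmetrically.
    assert(std::isnan(std::max(nan, 0.f)));   // NaN in the first argument propagates.
    assert(!std::isnan(std::max(0.f, nan)));  // NaN in the second argument is dropped.
    assert(std::isnan(std::min(nan, 0.f)));
    assert(!std::isnan(std::min(0.f, nan)));
    return 0;
  }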