Merged in rmlarsen/eigen (pull request PR-230)

Fix a bug in psqrt for SSE and AVX when EIGEN_FAST_MATH=1
This commit is contained in:
Gael Guennebaud 2016-10-12 16:30:51 +00:00
commit 5c366fe1d7
3 changed files with 47 additions and 45 deletions

View File

@ -355,30 +355,27 @@ pexp<Packet4d>(const Packet4d& _x) {
// Functions for sqrt. // Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. The main advantage of this approach is not just speed, but // exact solution. It does not handle +inf, or denormalized numbers correctly.
// also the fact that it can be inlined and pipelined with other computations, // The main advantage of this approach is not just speed, but also the fact that
// further reducing its effective latency. // it can be inlined and pipelined with other computations, further reducing its
// effective latency. This is similar to Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
#if EIGEN_FAST_MATH #if EIGEN_FAST_MATH
template <> template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psqrt<Packet8f>(const Packet8f& _x) { psqrt<Packet8f>(const Packet8f& _x) {
_EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
_EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); Packet8f denormal_mask = _mm256_and_ps(
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
Packet8f neg_half = pmul(_x, p8f_minus_half); _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
Packet8f non_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_GE_OQ);
Packet8f x = _mm256_and_ps(non_zero_mask, _mm256_rsqrt_ps(_x));
// Compute approximate reciprocal sqrt.
Packet8f x = _mm256_rsqrt_ps(_x);
// Do a single step of Newton's iteration. // Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
// Flush results for denormals to zero.
// Multiply the original _x by it's reciprocal square root to extract the return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
// square root.
return pmul(_x, x);
} }
#else #else
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED

View File

@ -444,20 +444,28 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
#if EIGEN_FAST_MATH #if EIGEN_FAST_MATH
// This is based on Quake3's fast inverse square root. // Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. It does not handle +inf, or denormalized numbers correctly.
// The main advantage of this approach is not just speed, but also the fact that
// it can be inlined and pipelined with other computations, further reducing its
// effective latency. This is similar to Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/ // For detail see here: http://www.beyond3d.com/content/articles/8/
// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& _x) Packet4f psqrt<Packet4f>(const Packet4f& _x)
{ {
Packet4f half = pmul(_x, pset1<Packet4f>(.5f)); Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
Packet4f denormal_mask = _mm_and_ps(
_mm_cmpge_ps(_x, _mm_setzero_ps()),
_mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
/* select only the inverse sqrt of non-zero inputs */ // Compute approximate reciprocal sqrt.
Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())); Packet4f x = _mm_rsqrt_ps(_x);
Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); // Do a single step of Newton's iteration.
x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x)))); x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
return pmul(_x,x); // Flush results for denormals to zero.
return _mm_andnot_ps(denormal_mask, pmul(_x,x));
} }
#else #else

View File

@ -440,12 +440,9 @@ template<typename Scalar> void packetmath_real()
data1[0] = Scalar(-1.0f); data1[0] = Scalar(-1.0f);
h.store(data2, internal::plog(h.load(data1))); h.store(data2, internal::plog(h.load(data1)));
VERIFY((numext::isnan)(data2[0])); VERIFY((numext::isnan)(data2[0]));
#if !EIGEN_FAST_MATH
h.store(data2, internal::psqrt(h.load(data1))); h.store(data2, internal::psqrt(h.load(data1)));
VERIFY((numext::isnan)(data2[0])); VERIFY((numext::isnan)(data2[0]));
VERIFY((numext::isnan)(data2[1])); VERIFY((numext::isnan)(data2[1]));
#endif
} }
} }