Fix bug #642: add vectorization of sqrt for doubles, and make sqrt really safe if EIGEN_FAST_MATH is disabled

(grafted from d4dd6aaed2c70b5e32541e96b4864b90dc07c614
 and c47010e3d221396cde2aad6b3250fa698a930e28
)
This commit is contained in:
Gael Guennebaud 2013-08-19 16:02:27 +02:00
parent f9149f9ba0
commit 2b50ade6ca
3 changed files with 16 additions and 4 deletions

View File

@ -442,8 +442,11 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
return _mm_xor_ps(y, sign_bit);
}
#if EIGEN_FAST_MATH
// This is based on Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
// It lacks 1 bit of precision (2 bits in some rare cases), and does not handle negative, +inf, or denormalized numbers correctly.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& _x)
{
@ -457,6 +460,14 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
return pmul(_x,x);
}
#else
// Fallback used when EIGEN_FAST_MATH is disabled: instead of the approximate
// fast-math implementation above, forward the packet straight to the
// _mm_sqrt_ps intrinsic.
template<>
EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x)
{
  return _mm_sqrt_ps(x);
}
#endif
// Square root of a Packet2d, forwarded directly to the _mm_sqrt_pd intrinsic.
// Defined outside the EIGEN_FAST_MATH conditional, so it is available in both
// configurations.
template<>
EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x)
{
  return _mm_sqrt_pd(x);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -83,7 +83,8 @@ template<> struct packet_traits<double> : default_packet_traits
size=2,
HasDiv = 1,
HasExp = 1
HasExp = 1,
HasSqrt = 1
};
};
template<> struct packet_traits<int> : default_packet_traits

View File

@ -64,9 +64,9 @@ run time. However, these assertions do cost time and can thus be turned off.
\c EIGEN_DONT_ALIGN is defined.
- \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless
alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
- \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. The only
optimization this currently includes is single precision sin() and cos() in the presence of SSE
vectorization. Defined by default.
- \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
enables the SSE vectorization of sin() and cos(), and speeds up sqrt() for single precision. Defined to 1 by default.
Define it to 0 to disable.
- \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
correspond to the number of iterations or the number of instructions. The default value is 100.