Fix bug #642: add vectorization of sqrt for doubles, and make sqrt really safe if EIGEN_FAST_MATH is disabled

(grafted from d4dd6aaed2c70b5e32541e96b4864b90dc07c614
 and c47010e3d221396cde2aad6b3250fa698a930e28
)
This commit is contained in:
Gael Guennebaud 2013-08-19 16:02:27 +02:00
parent f9149f9ba0
commit 2b50ade6ca
3 changed files with 16 additions and 4 deletions

View File

@ -442,8 +442,11 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
return _mm_xor_ps(y, sign_bit); return _mm_xor_ps(y, sign_bit);
} }
#if EIGEN_FAST_MATH
// This is based on Quake3's fast inverse square root. // This is based on Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/ // For detail see here: http://www.beyond3d.com/content/articles/8/
// It lacks 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& _x) Packet4f psqrt<Packet4f>(const Packet4f& _x)
{ {
@ -457,6 +460,14 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
return pmul(_x,x); return pmul(_x,x);
} }
#else
// Accurate path, compiled only when EIGEN_FAST_MATH is disabled: forward to
// the _mm_sqrt_ps intrinsic instead of the approximation used in the
// fast-math branch.
template<>
EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x)
{
  return _mm_sqrt_ps(x);
}
#endif
// Packet square root for Packet2d: forwards to the _mm_sqrt_pd intrinsic.
// Defined outside the EIGEN_FAST_MATH conditional, so it is used in both modes.
template<>
EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x)
{
  return _mm_sqrt_pd(x);
}
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen

View File

@ -83,7 +83,8 @@ template<> struct packet_traits<double> : default_packet_traits
size=2, size=2,
HasDiv = 1, HasDiv = 1,
HasExp = 1 HasExp = 1,
HasSqrt = 1
}; };
}; };
template<> struct packet_traits<int> : default_packet_traits template<> struct packet_traits<int> : default_packet_traits

View File

@ -64,9 +64,9 @@ run time. However, these assertions do cost time and can thus be turned off.
\c EIGEN_DONT_ALIGN is defined. \c EIGEN_DONT_ALIGN is defined.
- \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless
alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN. alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
- \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. The only - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
optimization this currently includes is single precision sin() and cos() in the present of SSE enables the SSE vectorization of sin() and cos(), and speeds up sqrt() for single precision. Defined to 1 by default.
vectorization. Defined by default. Define it to 0 to disable.
- \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable - \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
correspond to the number of iterations or the number of instructions. The default value is 100. correspond to the number of iterations or the number of instructions. The default value is 100.