From 2c44c401146194b9a010a1e3a4bdb5118f9f46e7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 14:21:24 +0100 Subject: [PATCH] First step toward a unification of packet log implementation, currently only SSE and AVX are unified. To this end, I added the following functions: pzero, pcmp_*, pfrexp, pset1frombits functions. --- Eigen/src/Core/GenericPacketMath.h | 59 ++++++++++ Eigen/src/Core/arch/AVX/MathFunctions.h | 100 +---------------- Eigen/src/Core/arch/AVX/PacketMath.h | 26 +++++ .../arch/Default/GenericPacketMathFunctions.h | 105 ++++++++++++++++++ Eigen/src/Core/arch/SSE/MathFunctions.h | 92 ++------------- Eigen/src/Core/arch/SSE/PacketMath.h | 21 +++- 6 files changed, 220 insertions(+), 183 deletions(-) create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index da1350f1b..316b03f96 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,6 +214,38 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (!b); } +/** \internal \returns the significant and exponent of the underlying floating point numbers + * See https://en.cppreference.com/w/cpp/numeric/math/frexp + */ +template EIGEN_DEVICE_FUNC inline Packet +pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); } + +/** \internal \returns zeros */ +template EIGEN_DEVICE_FUNC inline Packet +pzero(const Packet& a) { return pxor(a,a); } + +/** \internal \returns bits of \a or \b according to the input bit mask \a mask */ +template EIGEN_DEVICE_FUNC inline Packet +pselect(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); +} + +/** \internal \returns a <= b as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ + +/** \internal \returns a < b as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet +pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */ + +/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ + /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet pload(const typename unpacket_traits::type* from) { return *from; } @@ -226,6 +258,10 @@ ploadu(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } @@ -597,6 +633,29 @@ pinsertlast(const Packet& a, typename unpacket_traits::type b) return pblend(mask, pset1(b), a); } +/*************************************************************************** + * Some generic implementations to be used by implementors +***************************************************************************/ + +/** \internal shift the bits by n and cast the result to the initial type, i.e.: + * return float(reinterpret_cast(a) >> n) + */ +template EIGEN_DEVICE_FUNC inline Packet +pshiftright_and_cast(Packet a, int n); + +/** Default implementation of pfrexp for float. + * It is expected to be called by implementers of template<> pfrexp, + * and the above pshiftright_and_cast function must be implemented. + */ +template EIGEN_STRONG_INLINE Packet +pfrexp_float(const Packet& a, Packet& exponent) { + const Packet cst_126f = pset1(126.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_inv_mant_mask = pset1frombits(~0x7f800000u); + exponent = psub(pshiftright_and_cast(a,23), cst_126f); + return por(pand(a, cst_inv_mant_mask), cst_half); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 6af67ce2d..134facccd 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin, cos, exp, and log functions of this file are loosely derived from +/* The sin, cos, and exp functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -29,17 +29,6 @@ inline Packet8i pshiftleft(Packet8i v, int n) #endif } -inline Packet8f pshiftright(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif -} - // Sine function // Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and // evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants @@ -110,95 +99,10 @@ psin(const Packet8f& _x) { return res; } -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... -> profit! template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f plog(const Packet8f& _x) { - Packet8f x = _x; - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); - - // Polynomial coefficients. - _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); - - Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p8f_min_norm_pos); - - Packet8f emm0 = pshiftright(x,23); - Packet8f e = _mm256_sub_ps(emm0, p8f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm256_and_ps(x, p8f_inv_mant_mask); - x = _mm256_or_ps(x, p8f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); - Packet8f tmp = _mm256_and_ps(x, mask); - x = psub(x, p8f_1); - e = psub(e, _mm256_and_ps(p8f_1, mask)); - x = padd(x, tmp); - - Packet8f x2 = pmul(x, x); - Packet8f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet8f y, y1, y2; - y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); - y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); - y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); - y = pmadd(y, x, p8f_cephes_log_p2); - y1 = pmadd(y1, x, p8f_cephes_log_p5); - y2 = pmadd(y2, x, p8f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p8f_cephes_log_q1); - tmp = pmul(x2, p8f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p8f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm256_or_ps( - _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), - _mm256_and_ps(iszero_mask, p8f_minus_inf)); + return plog_float(_x); } // Exponential function. Works by writing "x = m*log(2) + r" where diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index c291b2acd..4b5bfebdf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -121,6 +121,11 @@ template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int from) { return _mm256_castsi256_ps(pset1(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } + template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -199,6 +204,12 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const // Arguments are swapped to match NaN propagation behavior of std::max. return _mm256_max_pd(b,a); } + +template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } + template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } @@ -363,6 +374,21 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } +template<> EIGEN_STRONG_INLINE Packet8f pshiftright_and_cast(Packet8f v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); + return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { + return pfrexp_float(a,exponent); +} + // preduxp should be ok // FIXME: why is this ok? why isn't the simply implementation working as expected? template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h new file mode 100644 index 000000000..e384b8288 --- /dev/null +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2009-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The log function of this file initially comes from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +namespace Eigen { +namespace internal { + +// Natural logarithm +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... -> profit! +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x) { + Packet x = _x; + + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + //const Packet cst_126f = pset1(126.0f); + // The smallest non denormalized float number. + const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); + const Packet cst_minus_inf = pset1frombits( 0xff800000u); + + // Polynomial coefficients. + const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); + const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); + const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); + const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); + const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); + const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); + const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); + const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); + const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); + const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); + const Packet cst_cephes_log_q1 = pset1(-2.12194440e-4f); + const Packet cst_cephes_log_q2 = pset1(0.693359375f); + + Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x)); + Packet iszero_mask = pcmp_eq(x,pzero(x)); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet y, y1, y2; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y2 = pmadd(y2, x, cst_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, cst_cephes_log_q1); + tmp = pmul(x2, cst_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, cst_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask)); +} + +} // end namespace internal +} // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 4af2c6cae..cba365cc5 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,13 +8,15 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from +/* The sin, cos and exp functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ #ifndef EIGEN_MATH_FUNCTIONS_SSE_H #define EIGEN_MATH_FUNCTIONS_SSE_H +#include "../Default/GenericPacketMathFunctions.h" + namespace Eigen { namespace internal { @@ -22,85 +24,7 @@ namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - - Packet4i emm0; - - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN - Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED @@ -266,7 +190,7 @@ Packet4f psin(const Packet4f& _x) _EIGEN_DECLARE_CONST_Packet4i(2, 2); _EIGEN_DECLARE_CONST_Packet4i(4, 4); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u); _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); @@ -482,11 +406,11 @@ Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f88e27fd2..9959005a3 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -83,7 +83,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet2d p2d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) + const Packet4f p4f_##NAME = pset1frombits(X) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -180,6 +180,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { re template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } + // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) // Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. @@ -328,6 +333,12 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } + + #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } @@ -517,6 +528,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { + return _mm_cvtepi32_ps(_mm_srli_epi32(_mm_castps_si128(a),n)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_float(a,exponent); +} + // with AVX, the default implementations based on pload1 are faster #ifndef __AVX__ template<> EIGEN_STRONG_INLINE void