From a22ef7e1f3307417e8499039a293daec3d395a7f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 25 Mar 2009 18:33:36 +0000 Subject: [PATCH] for some reason passing the argument by const reference killed the perf (in the packet version of sin, cos, exp, log), so let's pass them by value. Also, improve the perf of ei_plog by reducing dependencies. --- Eigen/src/Core/GenericPacketMath.h | 17 +++++-- .../Core/arch/SSE/TranscendentalFunctions.h | 51 +++++++++---------- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 21b4fb159..8583faa4a 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -63,6 +63,15 @@ template struct ei_packet_traits : ei_default_packet_traits { typedef T type; enum {size=1}; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasMin = 0, + HasMax = 0 + }; }; /** \internal \returns a + b (coeff-wise) */ @@ -172,16 +181,16 @@ template inline Packet ei_preverse(const Packet& a) ***************************/ /** \internal \returns the sin of \a a (coeff-wise) */ -template inline Packet ei_psin(const Packet& a) { return ei_sin(a); } +template inline Packet ei_psin(Packet a) { return ei_sin(a); } /** \internal \returns the cos of \a a (coeff-wise) */ -template inline Packet ei_pcos(const Packet& a) { return ei_cos(a); } +template inline Packet ei_pcos(Packet a) { return ei_cos(a); } /** \internal \returns the exp of \a a (coeff-wise) */ -template inline Packet ei_pexp(const Packet& a) { return ei_exp(a); } +template inline Packet ei_pexp(Packet a) { return ei_exp(a); } /** \internal \returns the log of \a a (coeff-wise) */ -template inline Packet ei_plog(const Packet& a) { return ei_log(a); } +template inline Packet ei_plog(Packet a) { return ei_log(a); } /*************************************************************************** * The following functions might not have to be 
overwritten for vectorized types diff --git a/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h b/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h index 3b6712524..0c8accc17 100644 --- a/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h +++ b/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h @@ -84,9 +84,8 @@ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1); _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4); _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375); -template<> EIGEN_DONT_INLINE Packet4f ei_plog(const Packet4f& _x) +template<> EIGEN_DONT_INLINE Packet4f ei_plog(Packet4f x) { - Packet4f x = _x; Packet4i emm0; Packet4f invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); @@ -113,26 +112,27 @@ template<> EIGEN_DONT_INLINE Packet4f ei_plog(const Packet4f& _x) e = ei_psub(e, _mm_and_ps(ei_p4f_1, mask)); x = ei_padd(x, tmp); - Packet4f z = ei_pmul(x,x); - - Packet4f y = ei_p4f_cephes_log_p0; - y = ei_pmadd(y, x, ei_p4f_cephes_log_p1); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p2); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p3); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p4); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p5); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p6); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p7); - y = ei_pmadd(y, x, ei_p4f_cephes_log_p8); - y = ei_pmul(y, x); - y = ei_pmul(y, z); + Packet4f x2 = ei_pmul(x,x); + Packet4f x3 = ei_pmul(x2,x); - y = ei_pmadd(e, ei_p4f_cephes_log_q1, y); - y = ei_psub(y, ei_pmul(z, ei_p4f_half)); - - tmp = ei_pmul(e, ei_p4f_cephes_log_q2); + Packet4f y, y1, y2; + y = ei_pmadd(ei_p4f_cephes_log_p0, x, ei_p4f_cephes_log_p1); + y1 = ei_pmadd(ei_p4f_cephes_log_p3, x, ei_p4f_cephes_log_p4); + y2 = ei_pmadd(ei_p4f_cephes_log_p6, x, ei_p4f_cephes_log_p7); + y = ei_pmadd(y , x, ei_p4f_cephes_log_p2); + y1 = ei_pmadd(y1, x, ei_p4f_cephes_log_p5); + y2 = ei_pmadd(y2, x, ei_p4f_cephes_log_p8); + y = ei_pmadd(y, x3, y1); + y = ei_pmadd(y, x3, y2); + y = ei_pmul(y, x3); + + y1 = ei_pmul(e, ei_p4f_cephes_log_q1); + 
tmp = ei_pmul(x2, ei_p4f_half); + y = ei_padd(y, y1); + x = ei_psub(x, tmp); + y2 = ei_pmul(e, ei_p4f_cephes_log_q2); x = ei_padd(x, y); - x = ei_padd(x, tmp); + x = ei_padd(x, y2); return _mm_or_ps(x, invalid_mask); // negative arg will be NAN } @@ -150,9 +150,8 @@ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2); _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1); _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1); -template<> EIGEN_DONT_INLINE Packet4f ei_pexp(const Packet4f& _x) +template<> EIGEN_DONT_INLINE Packet4f ei_pexp(Packet4f x) { - Packet4f x = _x; Packet4f tmp = _mm_setzero_ps(), fx; Packet4i emm0; @@ -215,16 +214,17 @@ _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005); _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003); _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002); _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516); // 4 / M_PI +_EIGEN_DECLARE_CONST_Packet4f(2pi, 2.*M_PI); -template<> EIGEN_DONT_INLINE Packet4f ei_psin(const Packet4f& _x) +template<> EIGEN_DONT_INLINE Packet4f ei_psin(Packet4f x) { - Packet4f x = _x; Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; Packet4i emm0, emm2; sign_bit = x; /* take the absolute value */ x = ei_pabs(x); + /* extract the sign bit (upper one) */ sign_bit = _mm_and_ps(sign_bit, ei_p4f_sign_mask); @@ -292,9 +292,8 @@ template<> EIGEN_DONT_INLINE Packet4f ei_psin(const Packet4f& _x) } /* almost the same as ei_psin */ -template<> EIGEN_DONT_INLINE Packet4f ei_pcos(const Packet4f& _x) +template<> Packet4f ei_pcos(Packet4f x) { - Packet4f x = _x; Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; Packet4i emm0, emm2;