From 36d9b51a44240ace201d38956b89293cb5cecd8d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jul 2010 15:16:34 +0200 Subject: [PATCH] optimize non fused MADD, and add a flatten attribute macro to enforce inlining within a function --- Eigen/src/Core/arch/SSE/Complex.h | 2 +- .../Core/products/GeneralBlockPanelKernel.h | 31 ++++++++++++++++--- Eigen/src/Core/util/Macros.h | 6 ++++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index d1880294c..8184159c7 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -284,7 +284,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_por (const Packet1cd& template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } -// FIXME force unaligned load, this is a temporary fix +// FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd ei_pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu(const std::complex* from) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9c9dd52f2..ee3e135ef 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -133,14 +133,34 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st computeProductBlockingSizes(k, m, n); } -// FIXME -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD -#endif #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else - #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); + + // FIXME (a bit overkill maybe ?) + + template struct ei_gebp_madd_selector { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) + { + c = cj.pmadd(a,b,c); + } + }; + + template struct ei_gebp_madd_selector { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t) + { + t = b; t = cj.pmul(a,t); c = ei_padd(c,t); + } + }; + + template + EIGEN_STRONG_INLINE void ei_gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) + { + ei_gebp_madd_selector::run(cj,a,b,c,t); + } + + #define MADD(CJ,A,B,C,T) ei_gebp_madd(CJ,A,B,C,T); +// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); #endif /* optimized GEneral packed Block * packed Panel product kernel @@ -170,6 +190,7 @@ struct ei_gebp_kernel typedef typename ei_meta_if::ret RhsPacket; typedef typename ei_meta_if::ret ResPacket; + EIGEN_FLATTEN_ATTRIB void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index f52e8b57c..987020e52 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -147,6 +147,12 @@ #define EIGEN_ALWAYS_INLINE_ATTRIB #endif +#if EIGEN_GNUC_AT_LEAST(4,0) +#define EIGEN_FLATTEN_ATTRIB __attribute__((flatten)) +#else +#define EIGEN_FLATTEN_ATTRIB +#endif + // EIGEN_FORCE_INLINE means "inline as much as possible" #if (defined _MSC_VER) || (defined __intel_compiler) #define EIGEN_STRONG_INLINE __forceinline