Optimize non-fused MADD, and add a flatten attribute macro to enforce inlining within a function
2025-10-15 01:21:29 +08:00 · 2010-07-13 15:16:34 +02:00 · 2010-07-13 15:16:34 +02:00 · 36d9b51a44
commit 36d9b51a44
parent b72b7ab76f
3 changed files with 33 additions and 6 deletions
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@ -284,7 +284,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_por    <Packet1cd>(const Packet1cd&
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }

-// FIXME force unaligned load, this is a temporary fix 
+// FIXME force unaligned load, this is a temporary fix
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from)
 { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from)
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@ -133,14 +133,34 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
 }

-// FIXME
-#ifndef EIGEN_HAS_FUSE_CJMADD
-#define EIGEN_HAS_FUSE_CJMADD
-#endif
 #ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
 #else
-  #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
+
+  // FIXME (a bit overkill maybe ?) -- selector backing ei_gebp_madd, only compiled when EIGEN_HAS_FUSE_CJMADD is not defined
+
+  template<typename CJ, typename A, typename B, typename C, typename T> struct ei_gebp_madd_selector {
+    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+    { // generic case (A, B, C, T are not all the same type): defer to cj.pmadd; the temporary is unused
+      c = cj.pmadd(a,b,c);
+    }
+  };
+
+  template<typename CJ, typename T> struct ei_gebp_madd_selector<CJ,T,T,T,T> {
+    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+    { // specialization for a, b, c and t all of one type T: multiply and add as separate steps through t
+      t = b; t = cj.pmul(a,t); c = ei_padd(c,t); // copy b into t, t = cj.pmul(a,t), then accumulate t into c; b is not written
+    }
+  };
+
+  template<typename CJ, typename A, typename B, typename C, typename T>
+  EIGEN_STRONG_INLINE void ei_gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+  { // entry point used by the non-fused MADD macro: picks the implementation from the operand types via ei_gebp_madd_selector
+    ei_gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+  }
+
+  #define MADD(CJ,A,B,C,T)  ei_gebp_madd(CJ,A,B,C,T); // non-fused path: forwards to ei_gebp_madd; the expansion already ends with ';'
+//   previous definition, kept for reference: #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
 #endif

 /* optimized GEneral packed Block * packed Panel product kernel
@ -170,6 +190,7 @@ struct ei_gebp_kernel
  typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
  typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;

+  EIGEN_FLATTEN_ATTRIB
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
  {
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@ -147,6 +147,12 @@
 #define EIGEN_ALWAYS_INLINE_ATTRIB
 #endif

+#if EIGEN_GNUC_AT_LEAST(4,0) // presumably a GCC version check (>= 4.0) -- confirm against the definition of EIGEN_GNUC_AT_LEAST
+#define EIGEN_FLATTEN_ATTRIB __attribute__((flatten)) // function attribute meant to enforce inlining of the calls made within the annotated function
+#else
+#define EIGEN_FLATTEN_ATTRIB // expands to nothing when the check above fails
+#endif
+
 // EIGEN_FORCE_INLINE means "inline as much as possible"
 #if (defined _MSC_VER) || (defined __intel_compiler)
 #define EIGEN_STRONG_INLINE __forceinline