PR 526: Speed up multiplication of small, dynamically sized matrices

The Packet16f, Packet8f and Packet8d types are too large to use with dynamically sized matrices typically processed by the SliceVectorizedTraversal specialization of the dense_assignment_loop. Using these types is likely to lead to little or no vectorization. Significant slowdown in the multiplication of these small matrices can be observed when building with AVX and AVX512 enabled. This patch introduces a new dense_assignment_kernel that is used when computing small products whose operands have dynamic dimensions. It ensures that the PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used when computing the product. I tested all 969 possible combinations of M, K, and N that are handled by the dense_assignment_loop on x86 builds. Although a few combinations are slowed down by this patch they are far outnumbered by the cases that are sped up, as the following results demonstrate. Disabling Packed8d on AVX512 builds: Total Cases: 969 Better: 511 Worse: 85 Same: 373 Max Improvement: 169.00% (4 8 6) Max Degradation: 36.50% (8 5 3) Median Improvement: 35.46% Median Degradation: 17.41% Total FLOPs Improvement: 19.42% Disabling Packet16f and Packed8f on AVX512 builds: Total Cases: 969 Better: 658 Worse: 5 Same: 306 Max Improvement: 214.05% (8 6 5) Max Degradation: 22.26% (16 2 1) Median Improvement: 60.05% Median Degradation: 13.32% Total FLOPs Improvement: 59.58% Disabling Packed8f on AVX builds: Total Cases: 969 Better: 663 Worse: 96 Same: 210 Max Improvement: 155.29% (4 10 5) Max Degradation: 35.12% (8 3 2) Median Improvement: 34.28% Median Degradation: 15.05% Total FLOPs Improvement: 26.02%
2025-09-15 02:43:14 +08:00 · 2018-10-12 15:20:21 +02:00 · 2018-10-12 15:20:21 +02:00 · aa110e681b
commit aa110e681b
parent d9392f9e55
2 changed files with 43 additions and 2 deletions
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@ -699,6 +699,26 @@ protected:
  DstXprType& m_dstExpr;
 };

+// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
+// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
+// when computing the product.
+
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+ public:
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::DstXprType DstXprType;
+    typedef typename find_best_packet<Scalar, 4>::type PacketType;
+    
+    EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {
+  }
+ };
+ 
 /***************************************************************************
 * Part 5 : Entry point for dense rectangular assignment
 ***************************************************************************/
@ -837,6 +857,27 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
  
  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+    typedef evaluator<Dst> DstEvaluatorType;
+    typedef evaluator<Src> SrcEvaluatorType;
+    typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
+
+    EIGEN_STATIC_ASSERT_LVALUE(Dst)
+    EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+    SrcEvaluatorType srcEvaluator(src);
+    resize_if_allowed(dst, src, func);
+    
+    DstEvaluatorType dstEvaluator(dst);
+    Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+    dense_assignment_loop<Kernel>::run(kernel);
+}
+
 template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias(Dst& dst, const Src& src)
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@ -424,7 +424,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
  {
-    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+    call_restricted_packet_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
  }

  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
@ -433,7 +433,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
  {
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
  }