Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-09-12 09:23:12 +08:00
Improvements to parallelFor.
Move some scalar functors from TensorFunctors.h to Eigen core.
parent ae9688f313
commit e55deb21c5
@@ -89,13 +89,13 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
   enum {
     Conj = NumTraits<LhsScalar>::IsComplex
   };
 
   typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
   { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
 
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
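For readers skimming the context above: scalar_conj_product_op computes conj(a) * b, conjugating only when the left scalar type is complex. A plain-C++ illustration of the scalar path (not Eigen code; values chosen for the check):

```cpp
#include <cassert>
#include <complex>

// What scalar_conj_product_op computes for complex inputs: conj(a) * b,
// the kernel of a complex dot product.
int main() {
  const std::complex<double> a(1.0, 2.0), b(3.0, -1.0);
  const std::complex<double> r = std::conj(a) * b;  // (1-2i)*(3-i) = 1-7i
  assert(r == std::complex<double>(1.0, -7.0));
}
```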
@@ -591,6 +591,47 @@ template<typename Scalar>
 struct functor_traits<scalar_inverse_mult_op<Scalar> >
 { enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
 
+/** \internal
+  * \brief Template functor to compute the modulo between an array and a fixed scalar.
+  */
+template <typename Scalar>
+struct scalar_mod_op {
+  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  const Scalar m_divisor;
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to compute the modulo between two arrays.
+  */
+template <typename Scalar>
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to compute the float modulo between two arrays.
+  */
+template <typename Scalar>
+struct scalar_fmod_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return numext::fmod(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
+         PacketAccess = false };
+};
+
 } // end namespace internal
 
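The three functors above are ordinary coefficient-wise functors; the public Array API reaches the same machinery through unaryExpr/binaryExpr. A minimal sketch of the equivalent element-wise modulo (illustrative values):

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXi a(5);
  a << 10, 11, 12, 13, 14;
  // unaryExpr applies a scalar functor coefficient-wise; this is the same
  // mechanism the new scalar_mod_op plugs into, shown here with a lambda.
  const Eigen::ArrayXi r = a.unaryExpr([](int x) { return x % 4; });
  std::cout << r.transpose() << "\n";  // prints: 2 3 0 1 2
}
```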
@@ -496,7 +496,7 @@ struct functor_traits<scalar_digamma_op<Scalar> >
     PacketAccess = packet_traits<Scalar>::HasDiGamma
   };
 };
 
 /** \internal
   * \brief Template functor to compute the Riemann Zeta function of two arguments.
   * \sa class CwiseUnaryOp, Cwise::zeta()
@@ -587,6 +587,33 @@ struct functor_traits<scalar_erfc_op<Scalar> >
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the sigmoid of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
+  */
+template <typename T>
+struct scalar_sigmoid_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    const T one = T(1);
+    return one / (one + numext::exp(-x));
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+template <typename T>
+struct functor_traits<scalar_sigmoid_op<T> > {
+  enum {
+    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
+    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
+  };
+};
+
 /** \internal
   * \brief Template functor to compute the atan of a scalar
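A quick scalar sanity check of what the functor computes, 1 / (1 + exp(-x)) (plain C++, illustrative):

```cpp
#include <cassert>
#include <cmath>

// Reference for what scalar_sigmoid_op computes: 1 / (1 + exp(-x)).
double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  assert(sigmoid(0.0) == 0.5);      // exp(-0) == 1 exactly
  assert(sigmoid(100.0) > 0.999);   // saturates toward 1
  assert(sigmoid(-100.0) < 0.001);  // saturates toward 0
}
```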
@@ -627,7 +654,7 @@ template<typename Scalar> struct scalar_tanh_op {
     const Packet plus_9 = pset1<Packet>(9.0);
     const Packet minus_9 = pset1<Packet>(-9.0);
     const Packet x = pmax(minus_9, pmin(plus_9, _x));
 
     // The monomial coefficients of the numerator polynomial (odd).
     const Packet alpha_1 = pset1<Packet>(4.89352455891786e-03);
     const Packet alpha_3 = pset1<Packet>(6.37261928875436e-04);
@@ -636,16 +663,16 @@ template<typename Scalar> struct scalar_tanh_op {
     const Packet alpha_9 = pset1<Packet>(-8.60467152213735e-11);
     const Packet alpha_11 = pset1<Packet>(2.00018790482477e-13);
     const Packet alpha_13 = pset1<Packet>(-2.76076847742355e-16);
 
     // The monomial coefficients of the denominator polynomial (even).
     const Packet beta_0 = pset1<Packet>(4.89352518554385e-03);
     const Packet beta_2 = pset1<Packet>(2.26843463243900e-03);
     const Packet beta_4 = pset1<Packet>(1.18534705686654e-04);
     const Packet beta_6 = pset1<Packet>(1.19825839466702e-06);
 
     // Since the polynomials are odd/even, we need x^2.
     const Packet x2 = pmul(x, x);
 
     // Evaluate the numerator polynomial p.
     Packet p = pmadd(x2, alpha_13, alpha_11);
     p = pmadd(x2, p, alpha_9);
@@ -654,12 +681,12 @@ template<typename Scalar> struct scalar_tanh_op {
     p = pmadd(x2, p, alpha_3);
     p = pmadd(x2, p, alpha_1);
     p = pmul(x, p);
 
     // Evaluate the denominator polynomial p.
     Packet q = pmadd(x2, beta_6, beta_4);
     q = pmadd(x2, q, beta_2);
     q = pmadd(x2, q, beta_0);
 
     // Divide the numerator by the denominator.
     return pdiv(p, q);
   }
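Context for the hunks above: scalar_tanh_op evaluates tanh(x) ≈ x·P(x²)/Q(x²) on a clamped input, with an odd numerator P and even denominator Q evaluated by Horner's rule (the pmadd chain). A scalar sketch of the same structure; alpha_5 and alpha_7 fall between the hunks shown, so the coefficient arrays are left as parameters rather than reproduced:

```cpp
#include <algorithm>

// Scalar sketch of the packet code above: tanh(x) ~ x * P(x^2) / Q(x^2) with
// the input clamped to [-9, 9]. alpha/beta stand for the full coefficient
// arrays (lowest degree first); only the evaluation structure is shown here.
template <int N, int M>
float rational_tanh(float x, const float (&alpha)[N], const float (&beta)[M]) {
  x = std::max(-9.0f, std::min(9.0f, x));  // clamp, as pmax/pmin do
  const float x2 = x * x;                  // odd/even polynomials need x^2
  float p = alpha[N - 1];                  // Horner, like the pmadd chain
  for (int i = N - 2; i >= 0; --i) p = p * x2 + alpha[i];
  p *= x;                                  // numerator is odd: x * P(x^2)
  float q = beta[M - 1];
  for (int i = M - 2; i >= 0; --i) q = q * x2 + beta[i];
  return p / q;                            // pdiv(p, q)
}
```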
@@ -938,7 +965,7 @@ struct scalar_sign_op<Scalar,true> {
 template<typename Scalar>
 struct functor_traits<scalar_sign_op<Scalar> >
 { enum {
     Cost =
         NumTraits<Scalar>::IsComplex
             ? ( 8*NumTraits<Scalar>::MulCost )  // roughly
             : ( 3*NumTraits<Scalar>::AddCost),
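On the cost estimate for real scalars: a sign function can be computed branchlessly with two comparisons and a subtraction, which is roughly the 3*AddCost used above. A sketch of that idea, not Eigen's implementation (which lies outside this hunk):

```cpp
// Branchless sign: two compares plus a subtract, ~3 add-cost operations.
template <typename Scalar>
Scalar sign_sketch(Scalar x) {
  return Scalar((Scalar(0) < x) - (x < Scalar(0)));  // -1, 0, or +1
}
```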
@@ -69,6 +69,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
 #include "src/Tensor/TensorDeviceCuda.h"
@@ -83,7 +84,6 @@ typedef unsigned __int64 uint64_t;
 
 #include "src/Tensor/TensorBase.h"
 
-#include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
@@ -172,67 +172,69 @@ struct ThreadPoolDevice {
     pool_->Schedule(func);
   }
 
-  // parallelFor executes f with [0, size) arguments in parallel and waits for
-  // completion. Block size is choosen between min_block_size and
-  // 2 * min_block_size to achieve the best parallel efficiency.
-  // If min_block_size == -1, parallelFor uses block size of 1.
-  // If hard_align > 0, block size is aligned to hard_align.
-  // If soft_align > hard_align, block size is aligned to soft_align provided
-  // that it does not increase block size too much.
-  void parallelFor(Index size, Index min_block_size, Index hard_align,
-                   Index soft_align,
+  // parallelFor executes f with [0, n) arguments in parallel and waits for
+  // completion. F accepts a half-open interval [first, last).
+  // Block size is choosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<Index(Index)> block_align,
                    std::function<void(Index, Index)> f) const {
-    if (size <= 1 || (min_block_size != -1 && size < min_block_size) ||
-        numThreads() == 1) {
-      f(0, size);
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    if (n <= 1 || numThreads() == 1 ||
+        CostModel::numThreads(n, cost, numThreads()) == 1) {
+      f(0, n);
       return;
     }
 
-    Index block_size = 1;
-    Index block_count = size;
-    if (min_block_size != -1) {
-      // Calculate block size based on (1) estimated cost and (2) parallel
-      // efficiency. We want blocks to be not too small to mitigate
-      // parallelization overheads; not too large to mitigate tail effect and
-      // potential load imbalance and we also want number of blocks to be evenly
-      // dividable across threads.
-      min_block_size = numext::maxi<Index>(min_block_size, 1);
-      block_size = numext::mini(min_block_size, size);
-      // Upper bound on block size:
-      const Index max_block_size = numext::mini(min_block_size * 2, size);
-      block_size = numext::mini(
-          alignBlockSize(block_size, hard_align, soft_align), size);
-      block_count = divup(size, block_size);
-      // Calculate parallel efficiency as fraction of total CPU time used for
-      // computations:
-      double max_efficiency =
-          static_cast<double>(block_count) /
-          (divup<int>(block_count, numThreads()) * numThreads());
-      // Now try to increase block size up to max_block_size as long as it
-      // doesn't decrease parallel efficiency.
-      for (Index prev_block_count = block_count; prev_block_count > 1;) {
-        // This is the next block size that divides size into a smaller number
-        // of blocks than the current block_size.
-        Index coarser_block_size = divup(size, prev_block_count - 1);
-        coarser_block_size =
-            alignBlockSize(coarser_block_size, hard_align, soft_align);
-        if (coarser_block_size > max_block_size) {
-          break;  // Reached max block size. Stop.
-        }
-        // Recalculate parallel efficiency.
-        const Index coarser_block_count = divup(size, coarser_block_size);
-        eigen_assert(coarser_block_count < prev_block_count);
-        prev_block_count = coarser_block_count;
-        const double coarser_efficiency =
-            static_cast<double>(coarser_block_count) /
-            (divup<int>(coarser_block_count, numThreads()) * numThreads());
-        if (coarser_efficiency + 0.01 >= max_efficiency) {
-          // Taking it.
-          block_size = coarser_block_size;
-          block_count = coarser_block_count;
-          if (max_efficiency < coarser_efficiency) {
-            max_efficiency = coarser_efficiency;
-          }
+    // Calculate block size based on (1) the iteration cost and (2) parallel
+    // efficiency. We want blocks to be not too small to mitigate
+    // parallelization overheads; not too large to mitigate tail
+    // effect and potential load imbalance and we also want number
+    // of blocks to be evenly dividable across threads.
+
+    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
+    const Index max_block_size =
+        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+    Index block_count = divup(n, block_size);
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) /
+        (divup<int>(block_count, numThreads()) * numThreads());
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = divup(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
+      const Index coarser_block_count = divup(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency =
+          static_cast<double>(coarser_block_count) /
+          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
       }
     }
   }
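The loop above trades larger blocks against parallel efficiency, where efficiency is the fraction of scheduled thread slots doing useful work. Here is the same heuristic extracted into a standalone function (a sketch: divup and the 1% tolerance mirror the diff, everything else is illustrative):

```cpp
#include <cstdio>

// Ceiling division, as Eigen's divup.
long divup(long x, long y) { return (x + y - 1) / y; }

// Mirror of the efficiency loop above: grow the block size toward
// max_block_size as long as the block count still divides evenly enough
// across threads (within the same 1% tolerance).
long pick_block_size(long n, long block_size, long max_block_size,
                     int threads) {
  long block_count = divup(n, block_size);
  double max_eff =
      double(block_count) / (divup(block_count, threads) * threads);
  for (long prev = block_count; prev > 1;) {
    const long coarser = divup(n, prev - 1);  // next size with fewer blocks
    if (coarser > max_block_size) break;      // reached the upper bound
    const long count = divup(n, coarser);
    prev = count;
    const double eff = double(count) / (divup(count, threads) * threads);
    if (eff + 0.01 >= max_eff) {              // accept near-equal efficiency
      block_size = coarser;
      if (eff > max_eff) max_eff = eff;
    }
  }
  return block_size;
}

int main() {
  // 1000 items, initial block 16 (63 blocks), cap 32, 8 threads: settles on
  // 32, giving 32 blocks that split 4-per-thread with no idle slots.
  std::printf("%ld\n", pick_block_size(1000, 16, 32, 8));
}
```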
@@ -251,26 +253,20 @@ struct ThreadPoolDevice {
       }
       // Split into halves and submit to the pool.
       Index mid = first + divup((last - first) / 2, block_size) * block_size;
-      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+      enqueue_func([=, &handleRange]() { handleRange(mid, last); });
+      enqueue_func([=, &handleRange]() { handleRange(first, mid); });
     };
-    handleRange(0, size);
+    handleRange(0, n);
     barrier.Wait();
   }
 
- private:
-  static Index alignBlockSize(Index size, Index hard_align, Index soft_align) {
-    if (soft_align > hard_align && size >= 4 * soft_align) {
-      // Align to soft_align, if it won't increase size by more than 25%.
-      return (size + soft_align - 1) & ~(soft_align - 1);
-    }
-    if (hard_align > 0) {
-      return (size + hard_align - 1) & ~(hard_align - 1);
-    }
-    return size;
+  // Convinience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
   }
 
+ private:
   ThreadPoolInterface* pool_;
   size_t num_threads_;
 };
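A sketch of how a caller might drive the new interface through the convenience overload. Names and constructor signatures (Eigen::ThreadPool, ThreadPoolDevice taking a pool pointer and core count, TensorOpCost taking bytes loaded, bytes stored, and compute cycles per iteration) reflect this era of the codebase and should be treated as assumptions, not a definitive API reference:

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  // Device setup: a thread pool plus a core-count hint.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  std::vector<float> data(1 << 20, 1.0f);
  // Rough per-iteration cost: 4 bytes loaded, 4 stored, 1 compute cycle.
  const Eigen::TensorOpCost cost(4, 4, 1);

  // Convenience overload: no block_align callback; f gets [first, last).
  device.parallelFor(static_cast<Eigen::Index>(data.size()), cost,
                     [&data](Eigen::Index first, Eigen::Index last) {
                       for (Eigen::Index i = first; i < last; ++i)
                         data[i] *= 2.0f;
                     });
}
```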
@@ -137,6 +137,13 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
     {
       const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
+#if defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange::run(&evaluator, first, last);
+                         });
+#else
       size_t num_threads = device.numThreads();
 #ifdef EIGEN_USE_COST_MODEL
       if (num_threads > 1) {
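Here EvalRange::alignBlockSize is passed as the block_align callback. A callback of that shape rounds the block size up to a multiple of the packet width so vectorized inner loops never straddle a block boundary; rounding up also satisfies the eigen_assert(new_block_size >= block_size) in parallelFor. A sketch, with PacketSize = 8 as an illustrative constant:

```cpp
#include <Eigen/Core>

// Sketch of a block_align callback in the spirit of EvalRange::alignBlockSize:
// round the requested block size up to a multiple of the packet width.
// kPacketSize = 8 (e.g. floats per AVX register) is assumed for illustration.
Eigen::Index alignToPacket(Eigen::Index block_size) {
  const Eigen::Index kPacketSize = 8;
  return ((block_size + kPacketSize - 1) / kPacketSize) * kPacketSize;
}
```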
@@ -163,11 +170,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
         }
         barrier.Wait();
       }
+#endif  // EIGEN_USE_NONBLOCKING_THREAD_POOL
     }
     evaluator.cleanup();
   }
 };
-#endif
+#endif // EIGEN_USE_THREADS
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
@@ -13,77 +13,6 @@
 namespace Eigen {
 namespace internal {
 
-
-/** \internal
- * \brief Template functor to compute the modulo between an array and a scalar.
- */
-template <typename Scalar>
-struct scalar_mod_op {
-  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
-  const Scalar m_divisor;
-};
-template <typename Scalar>
-struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
-
-
-/** \internal
- * \brief Template functor to compute the modulo between 2 arrays.
- */
-template <typename Scalar>
-struct scalar_mod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
-};
-template <typename Scalar>
-struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
-
-template <typename Scalar>
-struct scalar_fmod_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
-  operator()(const Scalar& a, const Scalar& b) const {
-    return numext::fmod(a, b);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_fmod_op<Scalar> > {
-  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
-         PacketAccess = false };
-};
-
-
-/** \internal
- * \brief Template functor to compute the sigmoid of a scalar
- * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
- */
-template <typename T>
-struct scalar_sigmoid_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
-    const T one = T(1);
-    return one / (one + numext::exp(-x));
-  }
-
-  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet packetOp(const Packet& x) const {
-    const Packet one = pset1<Packet>(T(1));
-    return pdiv(one, padd(one, pexp(pnegate(x))));
-  }
-};
-
-template <typename T>
-struct functor_traits<scalar_sigmoid_op<T> > {
-  enum {
-    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
-    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
-                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
-  };
-};
-
-
 // Standard reduction functors
 template <typename T> struct SumReducer
 {