Improvements to cost model.

Rasmus Munk Larsen 2016-04-14 15:52:58 -07:00
parent d2e95492e7
commit aeb5494a0b


@@ -88,6 +88,13 @@ class TensorOpCost {
            compute_cost * compute_cycles_;
   }
 
+  // Drop memory access component. Intended for cases when memory accesses are
+  // sequential or are completely masked by computations.
+  EIGEN_DEVICE_FUNC void dropMemoryCost() {
+    bytes_loaded_ = 0;
+    bytes_stored_ = 0;
+  }
+
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
       const TensorOpCost& rhs) {
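
The dropMemoryCost() hook added above lets an evaluator discount the memory terms of a cost estimate when its accesses are sequential (prefetch-friendly) or fully overlapped with computation, so only compute_cycles_ feeds into total_cost(). A minimal usage sketch, assuming the three-argument TensorOpCost constructor (bytes loaded, bytes stored, compute cycles) defined earlier in this header; the per-coefficient numbers are invented for illustration:

#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical evaluator helper, not part of the patch.
Eigen::TensorOpCost costOfSequentialKernel() {
  // Say each output coefficient loads 16 bytes, stores 8 bytes and spends
  // 4 compute cycles.
  Eigen::TensorOpCost cost(/*bytes_loaded=*/16.0, /*bytes_stored=*/8.0,
                           /*compute_cycles=*/4.0);
  // The reads/writes are sequential, so zero the memory component; any later
  // total_cost(load, store, compute) call now returns 4 * compute only.
  cost.dropMemoryCost();
  return cost;
}

With the memory terms dropped, the TensorCostModel::totalCost() introduced below reduces to output_size * kDeviceCyclesPerComputeCycle * compute_cycles_ for such a kernel.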
@@ -155,24 +162,50 @@ class TensorOpCost {
 template <typename Device>
 class TensorCostModel {
  public:
-  // Costs in device cycles.
-  static const int kLoadCycles = 3;
-  static const int kStoreCycles = 3;
   // Scaling from Eigen compute cost to device cycles.
   static const int kDeviceCyclesPerComputeCycle = 1;
 
-  // Implements a simple "binary" policy: Return 1 if total cost is below
-  // kMinWorkToParallelize and max_threads otherwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int numThreads(
+  // Costs in device cycles.
+  static const int kStartupCycles = 100000;
+  static const int kPerThreadCycles = 100000;
+  static const int kTaskSize = 40000;
+
+  // Returns the number of threads in [1:max_threads] to use for
+  // evaluating an expression with the given output size and cost per
+  // coefficient.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
-    // Compute total cost C in device cycles.
-    const double total_cost =
-        output_size *
+    double cost = totalCost(output_size, cost_per_coeff);
+    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    return numext::mini(max_threads, numext::maxi(1, threads));
+  }
+
+  // taskSize assesses parallel task size.
+  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+  // granularity needs to be increased to mitigate parallelization overheads.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+      double output_size, const TensorOpCost& cost_per_coeff) {
+    return totalCost(output_size, cost_per_coeff) / kTaskSize;
+  }
+
+ private:
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+      double output_size, const TensorOpCost& cost_per_coeff) {
+    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+    // 11 is L2 cache latency on Haswell.
+    // We don't know whether data is in L1, L2 or L3. But we are most interested
+    // in single-threaded computational time around 100us-10ms (smaller time
+    // is too small for parallelization, larger time is not interesting
+    // either because we are probably using all available threads already).
+    // And for the target time range, L2 seems to be what matters. Data set
+    // fitting into L1 is too small to take noticeable time. Data set fitting
+    // only into L3 presumably will take more than 10ms to load and process.
+    const double kLoadCycles = 1.0 / 64 * 11;
+    const double kStoreCycles = 1.0 / 64 * 11;
+    // Scaling from Eigen compute cost to device cycles.
+    return output_size *
         cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                   kDeviceCyclesPerComputeCycle);
-    // Smallest work unit to parallelize.
-    const double kMinParallelCost = 1e6;
-    return total_cost < kMinParallelCost ? 1 : max_threads;
   }
 };
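
For intuition about the new numbers: totalCost() charges roughly 11/64 of a device cycle per byte moved (one L2 access amortized over a 64-byte cache line) plus kDeviceCyclesPerComputeCycle per Eigen compute cycle, and numThreads() then grants one extra thread per kPerThreadCycles of estimated work beyond a kStartupCycles baseline (the +0.9 biases the integer truncation upward), clamped to [1, max_threads]. The sketch below is not part of the patch; it simply mirrors that arithmetic with made-up per-coefficient load/store/compute figures so the thresholds can be sanity-checked:

// Standalone sketch mirroring the patched cost model; does not include Eigen.
#include <algorithm>
#include <cstdio>

namespace {
constexpr double kLoadCycles = 1.0 / 64 * 11;   // ~0.17 cycles per byte loaded
constexpr double kStoreCycles = 1.0 / 64 * 11;  // ~0.17 cycles per byte stored
constexpr double kDeviceCyclesPerComputeCycle = 1;
constexpr double kStartupCycles = 100000;
constexpr double kPerThreadCycles = 100000;
constexpr double kTaskSize = 40000;

// Per-coefficient cost, as in TensorOpCost::total_cost().
double perCoeffCost(double bytes_loaded, double bytes_stored, double compute_cycles) {
  return kLoadCycles * bytes_loaded + kStoreCycles * bytes_stored +
         kDeviceCyclesPerComputeCycle * compute_cycles;
}

// Same formula as TensorCostModel::numThreads() in the patch.
int numThreads(double output_size, double cost_per_coeff, int max_threads) {
  const double cost = output_size * cost_per_coeff;
  const int threads =
      static_cast<int>((cost - kStartupCycles) / kPerThreadCycles + 0.9);
  return std::min(max_threads, std::max(1, threads));
}
}  // namespace

int main() {
  // E.g. a float cwise op: 8 bytes loaded, 4 stored, 1 compute cycle per output.
  const double c = perCoeffCost(8, 4, 1);  // ~3.06 device cycles per coefficient
  const double sizes[] = {1e4, 1e5, 1e6, 1e7};
  for (double n : sizes) {
    std::printf("output_size=%.0e -> threads=%d, taskSize=%.2f\n", n,
                numThreads(n, c, /*max_threads=*/16), n * c / kTaskSize);
  }
  return 0;
}

With these constants an expression estimated at less than about 2.1e5 device cycles still runs on a single thread, a smoother ramp-up than the old all-or-nothing kMinParallelCost = 1e6 switch, and taskSize() flags any task below kTaskSize = 4e4 cycles (taskSize < 1.0) as needing coarser granularity before parallelization pays off.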