Do not reduce parallelism too much in contractions with a small number of threads

2025-10-09 22:56:31 +08:00 · 2019-02-04 12:59:33 -08:00 · 2019-02-04 12:59:33 -08:00 · 8491127082
commit 8491127082
parent eb21bab769
1 changed files with 12 additions and 3 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -339,10 +339,19 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
      // If there is enough available parallelism in sharding dimension we can
      // call kernels in sync mode and use thread local memory for packed data.
      const Index sharding_dim_tasks = shard_by_col ? nn : nm;
-      if (!parallel_pack_ && sharding_dim_tasks >= device_.numThreadsInPool()) {
-        parallelize_by_sharding_dim_only_ = true;

-        int num_worker_threads = device_.numThreadsInPool();
+      const int num_worker_threads = device_.numThreadsInPool();
+
+      // With small number of threads we want to make sure that we do not reduce
+      // parallelism too much.
+      const int oversharding_factor =
+          num_worker_threads <= 4  ? 8 :
+          num_worker_threads <= 8  ? 4 :
+          num_worker_threads <= 16 ? 2 : 1;
+
+      if (!parallel_pack_ &&
+          sharding_dim_tasks >= oversharding_factor * num_worker_threads) {
+        parallelize_by_sharding_dim_only_ = true;

        if (shard_by_col) {
          can_use_thread_local_packed_ = new std::atomic<bool>[nn_];