From 4e4dcd9026ed36c074170c13d4092eddaec5b285 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 6 Mar 2019 10:39:07 -0800 Subject: [PATCH 1/3] Remove redundant steal loop --- .../Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 49603d6c1..115e39d07 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -56,6 +56,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { thread_data_[i].thread.reset( env_.CreateThread([this, i]() { WorkerLoop(i); })); } + global_steal_partition_ = EncodePartition(0, num_threads_); #ifndef EIGEN_THREAD_LOCAL // Wait for workers to initialize per_thread_map_. Otherwise we might race // with them in Schedule or CurrentThreadId. @@ -237,6 +238,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { MaxSizeVector<ThreadData> thread_data_; MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_; MaxSizeVector<EventCount::Waiter> waiters_; + unsigned global_steal_partition_; std::atomic<unsigned> blocked_; std::atomic<bool> spinning_; std::atomic<bool> done_; @@ -354,6 +356,9 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { Task LocalSteal() { PerThread* pt = GetPerThread(); unsigned partition = GetStealPartition(pt->thread_id); + // If thread steal partition is the same as global partition, there is no + // need to go through the steal loop twice.
+ if (global_steal_partition_ == partition) Task(); unsigned start, limit; DecodePartition(partition, &start, &limit); AssertBounds(start, limit); From 1bc2a0a57c5054754749dcb3235597098a208eaf Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 6 Mar 2019 10:49:49 -0800 Subject: [PATCH 2/3] Add missing return to NonBlockingThreadPool::LocalSteal --- unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 115e39d07..bd1910dcc 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -358,7 +358,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { unsigned partition = GetStealPartition(pt->thread_id); // If thread steal partition is the same as global partition, there is no // need to go through the steal loop twice. - if (global_steal_partition_ == partition) Task(); + if (global_steal_partition_ == partition) return Task(); unsigned start, limit; DecodePartition(partition, &start, &limit); AssertBounds(start, limit); From cc407c9d4d0fdc60348642b74c89f08a041cd2a2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 6 Mar 2019 11:40:06 -0800 Subject: [PATCH 3/3] Fix placement of "#if defined(EIGEN_GPUCC)" guard region. Found with -Wundefined-func-template. 
Author: tkoeppe@google.com --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1c44541bd..057e90e50 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -317,6 +317,7 @@ class TensorExecutor class TensorExecutor { @@ -326,7 +327,6 @@ class TensorExecutor { }; -#if defined(EIGEN_GPUCC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE