Turn on the new thread pool by default since it scales much better over multiple cores. It is still possible to revert to the old thread pool by compiling with the EIGEN_USE_SIMPLE_THREAD_POOL define.
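As a minimal usage sketch (not part of this commit; it assumes the usual unsupported Tensor module include path and the ThreadPool/ThreadPoolDevice API of this era), reverting to the old pool just means defining the macro before the Tensor headers are included, or passing -DEIGEN_USE_SIMPLE_THREAD_POOL to the compiler:

// Sketch: opting back into the old SimpleThreadPool after this change.
// Assumes the unsupported Tensor module layout; adjust the include path if needed.
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL  // without this define, the new non-blocking pool is now the default
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // should resolve to the old SimpleThreadPool here
  Eigen::ThreadPoolDevice device(&pool, 4);
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  c.device(device) = a.contract(b, dims);    // exercises the threaded contraction path touched below
  return 0;
}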

Benoit Steiner 2016-05-13 17:23:15 -07:00
parent 97605c7b27
commit 83dfb40f66
3 changed files with 94 additions and 6 deletions

View File

@@ -15,7 +15,7 @@
namespace Eigen {
-#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {
template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -54,7 +54,7 @@ struct packRhsAndKernelArg {
};
} // end namespace internal
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
@@ -731,7 +731,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return 0;
}
-#else // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#else // EIGEN_USE_SIMPLE_THREAD_POOL
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
@@ -1007,7 +1007,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
}
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
bool shard_by_col, bool prepacked) const {

View File

@@ -14,7 +14,7 @@ namespace Eigen {
// Use the SimpleThreadPool by default. We'll switch to the new non blocking
// thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
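For reference, after this change the selection block in this header presumably reads roughly as follows. Only the new #ifndef line and the non-blocking branch appear in the hunk above; the contents of the #else branch are an assumption based on the pre-existing SimpleThreadPoolTempl/SimpleThreadPool names, so treat them as a sketch rather than the exact source:

#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else  // assumed contents of the fallback branch
template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
typedef SimpleThreadPool ThreadPool;
#endif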

View File

@@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
}
}
#ifdef EIGEN_HAS_CUDA_FP16
/*
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
half* output, half2* scratch) {
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
eigen_assert(gridDim.z == 1);
const int unroll_times = 16;
eigen_assert(NumPerThread % unroll_times == 0);
eigen_assert(unroll_times % 2 == 0);
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (gridDim.x == 1) {
Index i = 2*thread_id;
for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
*((half2*)(output + i)) = reducer.template initializePacket<half2>();
}
if (i < num_preserved_coeffs) {
output[i] = reducer.initialize();
}
__syncthreads();
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
const Index row = i / input_col_blocks;
if (row + 1 < num_preserved_coeffs) {
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
half2 reduced_val1 = reducer.template initializePacket<half2>();
half2 reduced_val2 = reducer.template initializePacket<half2>();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
Index col = col_begin + blockDim.x * j;
for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col);
reducer.reducePacket(val, &reduced_val1);
// do the same for reduced_val2 here
}
if (col < num_coeffs_to_reduce) {
// Peel;
const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
const half2 val = __halves2half2(last, reducer.initialize());
reducer.reducePacket(val, &reduced_val1);
}
break;
} else {
// Faster version of the loop with no branches after unrolling.
#pragma unroll
for (int k = 0; k < unroll_times; ++k) {
const Index col = col_begin + blockDim.x * (j + k);
reducer.reducePacket(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val1);
}
}
}
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
if (row + 1 < num_preserved_coeffs) {
atomicReduce((half2*)(output + row), reduced_val1, reducer);
}
else {
atomicReduce(scratch, reduced_val1, reducer);
}
}
}
}
}
*/
#endif
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,