Use signed integers more consistently to encode the number of threads to use to evaluate a tensor expression.
This commit is contained in:
parent 8f92c26319
commit 14a112ee15
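
The motivation: Eigen's Tensor code feeds thread counts into arithmetic with the signed Index type, and an unsigned count (size_t) silently wraps on expressions such as num_threads - 1 and forces implicit conversions in comparisons. The following is a minimal, self-contained sketch of that pitfall; it is illustrative only and not part of this patch.

#include <cstddef>
#include <iostream>

// Illustrative only -- not Eigen code. Shows why a signed thread count is
// less error-prone than an unsigned one in work-partitioning arithmetic.
int main() {
  std::size_t unsigned_threads = 0;  // e.g. "no extra worker threads"
  int signed_threads = 0;

  std::size_t wrapped = unsigned_threads - 1;  // wraps to 18446744073709551615 on 64-bit
  int sensible = signed_threads - 1;           // -1, easy to test for

  std::cout << "unsigned: " << wrapped << "\n"
            << "signed:   " << sensible << "\n";

  // Mixing the unsigned count with a signed Index (ptrdiff_t in Eigen) also
  // needs explicit casts to avoid sign-compare warnings.
  std::ptrdiff_t i = -1;
  std::cout << (i < static_cast<std::ptrdiff_t>(unsigned_threads)) << "\n";
  return 0;
}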
unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // across k dimension.
     const TensorOpCost cost =
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-    Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());

     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   class Context {
    public:
     Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
             Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           lhs_(lhs),
           rhs_(rhs),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          m_(m),
-          n_(n),
-          k_(k),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),
unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }

   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }

-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }

@@ -182,7 +182,7 @@ struct ThreadPoolDevice {
                    std::function<void(Index, Index)> f) const {
     typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, numThreads()) == 1) {
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
@@ -242,7 +242,7 @@ struct ThreadPoolDevice {
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
-    Barrier barrier(block_count);
+    Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
     handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
       if (last - first <= block_size) {
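
For context on the cast in the hunk above: block_count is a signed Index while the barrier is constructed from an unsigned count, so the narrowing is made explicit rather than left to an implicit conversion. A minimal stand-in, with a hypothetical Barrier that only mimics the constructor signature:

#include <cassert>
#include <cstddef>

// Hypothetical stand-in for a barrier built from an unsigned count;
// not Eigen's actual Barrier implementation.
struct Barrier {
  explicit Barrier(unsigned int count) : count_(count) {}
  unsigned int count_;
};

int main() {
  std::ptrdiff_t block_count = 8;  // plays the role of the signed Index
  assert(block_count > 0);         // must be positive before narrowing
  Barrier barrier(static_cast<unsigned int>(block_count));  // explicit, warning-free
  return barrier.count_ == 8u ? 0 : 1;
}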
@@ -268,7 +268,7 @@ struct ThreadPoolDevice {

  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };

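
Taken together, the thread count now flows through the device as a plain int. A sketch of how this reads at a call site, assuming the usual EIGEN_USE_THREADS setup (the pool size and loop body are made up for illustration):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                       // hypothetical core count
  Eigen::ThreadPoolDevice device(&pool, 4);        // num_cores is an int after this change
  for (int i = 0; i < device.numThreads(); ++i) {  // signed loop, no sign-compare warning
    // dispatch per-thread work here
  }
  return 0;
}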