Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-12 19:59:05 +08:00)
Replace std::vector with our own implementation, as using the STL when compiling with nvcc and AVX enabled leads to many issues.
parent 6d6413f768
commit 46177c8d64
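The rewrite this commit applies at each call site is mechanical: std::vector's two-phase construct-then-reserve becomes a single sized construction, since MaxSizeVector fixes its capacity at construction time and has no reserve() member. A minimal before/after sketch with illustrative names (not the actual Eigen call sites; it assumes MaxSizeVector is reachable through the unsupported/Eigen/CXX11/Tensor umbrella header, which is not named explicitly in the first hunk below):

#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

void before(int numblocks) {
  std::vector<int*> results;     // default-construct empty...
  results.reserve(numblocks);    // ...then reserve capacity
  for (int i = 0; i < numblocks; ++i) results.push_back(nullptr);
}

void after(int numblocks) {
  // Capacity is fixed up front; push_back asserts size < capacity.
  Eigen::MaxSizeVector<int*> results(numblocks);
  for (int i = 0; i < numblocks; ++i) results.push_back(nullptr);
}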
@@ -33,6 +33,7 @@
 #include <vector>
 
 #include "src/Core/util/EmulateArray.h"
+#include "src/Core/util/MaxSizeVector.h"
 
 // Emulate the cxx11 functionality that we need if the compiler doesn't support it.
 // Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h (new file, 130 lines)
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+  * \ingroup Core
+  *
+  * \brief The MaxSizeVector class.
+  *
+  * The %MaxSizeVector provides a subset of std::vector functionality.
+  *
+  * The goal is to provide basic std::vector operations when using
+  * std::vector is not an option (e.g. on GPU or when compiling using
+  * FMA/AVX, as this can cause either compilation failures or illegal
+  * instruction failures).
+  *
+  * Beware: The constructors are not API compatible with those of
+  * std::vector.
+  */
+template <typename T>
+class MaxSizeVector {
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
+  }
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  ~MaxSizeVector() {
+    for (size_t i = 0; i < size_; ++i) {
+      data_[i].~T();
+    }
+    internal::aligned_free(data_);
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    data_[size_++] = t;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& operator[] (size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& operator[] (size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void pop_back() {
+    // NOTE: This does not destroy the value at the end the way
+    // std::vector's version of pop_back() does. That happens when
+    // the Vector is destroyed.
+    eigen_assert(size_ > 0);
+    size_--;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H
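A minimal usage sketch of the class added above (illustrative, not part of the commit). As the source shows, MaxSizeVector(n) reserves capacity for n elements but leaves size() at 0, so elements are appended with push_back up to the reserved count, and pop_back only decrements the size; the popped element is destroyed when the vector itself is.

#include <cstdio>
#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

int main() {
  Eigen::MaxSizeVector<int> v(4);              // capacity 4, size 0
  for (int i = 0; i < 4; ++i) v.push_back(i);  // size 4; a 5th push_back would assert
  v.pop_back();                                // size 3; value destroyed with the vector
  for (const int* it = v.begin(); it != v.end(); ++it) std::printf("%d\n", *it);
  return 0;
}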
@@ -28,7 +28,7 @@ struct packLhsArg {
 
 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
 struct packRhsAndKernelArg {
-  const std::vector<LhsScalar*>* blockAs;
+  const MaxSizeVector<LhsScalar*>* blockAs;
   RhsScalar* blockB;
   const RhsMapper& rhs;
   OutputMapper& output;
@@ -46,8 +46,8 @@ struct packRhsAndKernelArg {
   const Index n_block_idx;
   const Index m_blocks;
   const Index n_blocks;
-  std::vector<Notification*>* kernel_notifications;
-  const std::vector<Notification*>* lhs_notifications;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
   const bool need_to_pack;
 };
 
@@ -202,8 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // the alignment requirements with the assumption that
     // (Traits::mr * sizeof(ResScalar)) % 16 == 0
     const Index numBlockAs = numext::mini(num_threads, m_blocks);
-    std::vector<LhsScalar *> blockAs;
-    blockAs.reserve(num_threads);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
    for (int i = 0; i < num_threads; i++) {
      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
    }
@@ -212,18 +211,17 @@
     // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
     // Other options: (1) reuse memory when a thread finishes. con: tricky
     //                (2) allocate block B memory in each thread. con: overhead
-    std::vector<RhsScalar *> blockBs;
-    blockBs.reserve(n_blocks);
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
    for (int i = 0; i < n_blocks; i++) {
      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
    }
 
     // lhs_notifications starts with all null Notifications
-    std::vector<Notification*> lhs_notifications(num_threads, nullptr);
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_notifications = num_threads * n_blocks;
-    std::vector<Notification*> kernel_notifications(num_kernel_notifications,
-                                                    nullptr);
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
+                                                      nullptr);
 
    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
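Note that two different MaxSizeVector constructors are in play in the hunk above: blockBs(n_blocks) is the capacity-only form (size 0, filled by push_back), while lhs_notifications(num_threads, nullptr) is the fill form (size equals capacity immediately, with slots overwritten in place later). A small sketch of the distinction, with illustrative values and the same assumed include path as before:

#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

void constructor_shapes() {
  int x = 0;
  Eigen::MaxSizeVector<int*> grow(3);             // capacity-only: size() == 0
  grow.push_back(&x);                             // append up to the capacity
  Eigen::MaxSizeVector<int*> filled(3, nullptr);  // fill form: size() == 3 at once
  filled[2] = &x;                                 // slots can be overwritten in place
}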
@@ -127,8 +127,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
     const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
     const Index numblocks = size / blocksize;
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
+    MaxSizeVector<Notification*> results(numblocks);
    for (int i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
    }
@@ -256,9 +256,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(
          device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
@@ -308,9 +307,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
                                       self, i * blocksize, blocksize, reducer,
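For context, the two FullReducer hunks above follow a shard-then-combine pattern: one accumulator per block, seeded with the reducer's identity value, reduced in parallel, then folded together. A hedged single-threaded sketch of that shape (generic code, not Eigen's actual implementation; it assumes a reducer exposing initialize() and reduce(value, accum) in the style the diff implies):

#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

template <typename Reducer, typename Scalar>
Scalar blockwise_reduce(const Scalar* data, int num_coeffs, int numblocks,
                        int blocksize, Reducer reducer) {
  // One accumulator per block; in Eigen each block runs as a FullReducerShard
  // on the thread pool, with a Notification signaling its completion.
  Eigen::MaxSizeVector<Scalar> shards(numblocks, reducer.initialize());
  for (int b = 0; b < numblocks; ++b)
    for (int i = b * blocksize; i < (b + 1) * blocksize; ++i)
      reducer.reduce(data[i], &shards[b]);
  // Combine the per-block partials, plus any tail the blocks did not cover.
  Scalar acc = reducer.initialize();
  for (int i = numblocks * blocksize; i < num_coeffs; ++i)
    reducer.reduce(data[i], &acc);
  for (int b = 0; b < numblocks; ++b)
    reducer.reduce(shards[b], &acc);
  return acc;
}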