From 46177c8d648a27d82d34cebed7e2b5bc59d441fc Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 8 Mar 2016 16:37:27 -0800
Subject: [PATCH] Replace std::vector with our own implementation, as using
 the stl when compiling with nvcc and avx enabled leads to many issues.

---
 unsupported/Eigen/CXX11/Core                      |   1 +
 .../Eigen/CXX11/src/Core/util/MaxSizeVector.h     | 130 ++++++++++++++++++
 .../src/Tensor/TensorContractionThreadPool.h      |  16 +--
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h       |   3 +-
 .../Eigen/CXX11/src/Tensor/TensorReduction.h      |  10 +-
 5 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h

diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core
index e3e2cb60c..946145f5a 100644
--- a/unsupported/Eigen/CXX11/Core
+++ b/unsupported/Eigen/CXX11/Core
@@ -33,6 +33,7 @@
 #include <vector>
 
 #include "src/Core/util/EmulateArray.h"
+#include "src/Core/util/MaxSizeVector.h"
 
 // Emulate the cxx11 functionality that we need if the compiler doesn't support it.
 // Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
diff --git a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h
new file mode 100644
index 000000000..551124bae
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+  * \ingroup Core
+  *
+  * \brief The MaxSizeVector class.
+  *
+  * The %MaxSizeVector provides a subset of std::vector functionality.
+  *
+  * The goal is to provide basic std::vector operations when using
+  * std::vector is not an option (e.g. on GPU or when compiling using
+  * FMA/AVX, as this can cause either compilation failures or illegal
+  * instruction failures).
+  *
+  * Beware: The constructors are not API compatible with those of
+  * std::vector.
+  */
+template <typename T>
+class MaxSizeVector {
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
+  }
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  ~MaxSizeVector() {
+    for (size_t i = 0; i < size_; ++i) {
+      data_[i].~T();
+    }
+    internal::aligned_free(data_);
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    data_[size_++] = t;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& operator[] (size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& operator[] (size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void pop_back() {
+    // NOTE: This does not destroy the value at the end the way
+    // std::vector's version of pop_back() does. That happens when
+    // the Vector is destroyed.
+    eigen_assert(size_ > 0);
+    size_--;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H
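
For illustration, a minimal usage sketch of the container added above. This is not part of the patch; it assumes the unsupported CXX11 headers are on the include path (Eigen/CXX11/Core pulls in MaxSizeVector.h, per the first hunk). Unlike std::vector, the capacity is fixed at construction, and a push_back beyond it trips an eigen_assert instead of reallocating:

    // Hypothetical standalone sketch; not from the patch.
    #include <cassert>
    #include <unsupported/Eigen/CXX11/Core>

    int main() {
      // One-argument constructor: reserves 4 slots, size() starts at 0.
      Eigen::MaxSizeVector<int> v(4);
      v.push_back(7);
      v.push_back(9);
      assert(v.size() == 2);
      assert(v.back() == 9);
      v.pop_back();  // shrinks size() but does not destroy the element
      assert(v.size() == 1);
      // A fifth push_back would fail eigen_assert(size_ < reserve_).
      return 0;
    }
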
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 02b3c6dea..9044454fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -28,7 +28,7 @@ struct packLhsArg {
 
 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
 struct packRhsAndKernelArg {
-  const std::vector<LhsScalar*>* blockAs;
+  const MaxSizeVector<LhsScalar*>* blockAs;
   RhsScalar* blockB;
   const RhsMapper& rhs;
   OutputMapper& output;
@@ -46,8 +46,8 @@ struct packRhsAndKernelArg {
   const Index n_block_idx;
   const Index m_blocks;
   const Index n_blocks;
-  std::vector<Notification*>* kernel_notifications;
-  const std::vector<Notification*>* lhs_notifications;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
   const bool need_to_pack;
 };
 
@@ -202,8 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
 
-    std::vector<LhsScalar *> blockAs;
-    blockAs.reserve(num_threads);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
     for (int i = 0; i < num_threads; i++) {
       blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
     }
 
@@ -212,18 +211,17 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
 
-    std::vector<RhsScalar *> blockBs;
-    blockBs.reserve(n_blocks);
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
     for (int i = 0; i < n_blocks; i++) {
       blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
     }
 
     // lhs_notifications starts with all null Notifications
-    std::vector<Notification*> lhs_notifications(num_threads, nullptr);
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_notifications = num_threads * n_blocks;
-    std::vector<Notification*> kernel_notifications(num_kernel_notifications,
-                                                    nullptr);
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
+                                                      nullptr);
 
     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
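
The mechanical change in this file repeats across the rest of the patch: each std::vector that was declared empty and then reserve()d becomes a MaxSizeVector whose capacity is passed to the constructor. Because the one-argument constructor only reserves (size() stays 0), the existing push_back loops work unchanged. A sketch of the idiom, with float* as a hypothetical stand-in for the LhsScalar*/RhsScalar* buffer types above:

    // Hypothetical sketch; names and values are illustrative only.
    #include <unsupported/Eigen/CXX11/Core>

    void pack_buffers(int num_threads) {
      // Before: std::vector<float*> blockAs; blockAs.reserve(num_threads);
      // After:  the capacity moves into the constructor.
      Eigen::MaxSizeVector<float*> blockAs(num_threads);
      for (int i = 0; i < num_threads; ++i) {
        blockAs.push_back(nullptr);  // stand-in for this->m_device.allocate(...)
      }
    }

    int main() { pack_buffers(4); return 0; }
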
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index fd9919829..54da77bcf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -127,8 +127,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
       const Index numblocks = size / blocksize;
 
-      std::vector<Notification*> results;
-      results.reserve(numblocks);
+      MaxSizeVector<Notification*> results(numblocks);
       for (int i = 0; i < numblocks; ++i) {
         results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 875155243..2d7fb80d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -256,9 +256,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       results.push_back(
           device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
@@ -308,9 +307,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, self,
                                        i * blocksize, blocksize, reducer,
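
The reduction hunks also exercise the two-argument constructor, which, unlike the one-argument form, resizes as well, mirroring std::vector's fill constructor for declarations such as shards(numblocks, reducer.initialize()) and lhs_notifications(num_threads, nullptr). A minimal sketch of that form, with hypothetical values:

    // Hypothetical sketch of the fill-constructor form; not from the patch.
    #include <cassert>
    #include <unsupported/Eigen/CXX11/Core>

    int main() {
      // Reserves and resizes to 3, copy-constructing every slot from 0.0f,
      // just like std::vector<float> shards(3, 0.0f).
      Eigen::MaxSizeVector<float> shards(3, 0.0f);
      assert(shards.size() == 3);
      for (const float* p = shards.begin(); p != shards.end(); ++p) {
        assert(*p == 0.0f);
      }
      return 0;
    }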