Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-12 19:59:05 +08:00)
Replace std::vector with our own implementation, as using the STL when compiling with nvcc and AVX enabled leads to many issues.
parent 6d6413f768
commit 46177c8d64
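The rewrite this commit applies at each call site is mechanical: std::vector's two-phase construct-then-reserve becomes a single sized construction, since MaxSizeVector fixes its capacity at construction time and has no reserve() member. A minimal before/after sketch with illustrative names (not the actual Eigen call sites; it assumes MaxSizeVector is reachable through the unsupported/Eigen/CXX11/Tensor umbrella header, which is not named explicitly in the first hunk below):

#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

void before(int numblocks) {
  std::vector<int*> results;     // default-construct empty...
  results.reserve(numblocks);    // ...then reserve capacity
  for (int i = 0; i < numblocks; ++i) results.push_back(nullptr);
}

void after(int numblocks) {
  // Capacity is fixed up front; push_back asserts size < capacity.
  Eigen::MaxSizeVector<int*> results(numblocks);
  for (int i = 0; i < numblocks; ++i) results.push_back(nullptr);
}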
@@ -33,6 +33,7 @@
 #include <vector>
 
 #include "src/Core/util/EmulateArray.h"
+#include "src/Core/util/MaxSizeVector.h"
 
 // Emulate the cxx11 functionality that we need if the compiler doesn't support it.
 // Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h (new file, 130 lines)
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+  * \ingroup Core
+  *
+  * \brief The MaxSizeVector class.
+  *
+  * The %MaxSizeVector provides a subset of std::vector functionality.
+  *
+  * The goal is to provide basic std::vector operations when using
+  * std::vector is not an option (e.g. on GPU or when compiling using
+  * FMA/AVX, as this can cause either compilation failures or illegal
+  * instruction failures).
+  *
+  * Beware: The constructors are not API compatible with those of
+  * std::vector.
+  */
+template <typename T>
+class MaxSizeVector {
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
+  }
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  ~MaxSizeVector() {
+    for (size_t i = 0; i < size_; ++i) {
+      data_[i].~T();
+    }
+    internal::aligned_free(data_);
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    data_[size_++] = t;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& operator[] (size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& operator[] (size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void pop_back() {
+    // NOTE: This does not destroy the value at the end the way
+    // std::vector's version of pop_back() does. That happens when
+    // the Vector is destroyed.
+    eigen_assert(size_ > 0);
+    size_--;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H
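A minimal usage sketch of the class added above (illustrative, not part of the commit). As the source shows, MaxSizeVector(n) reserves capacity for n elements but leaves size() at 0, so elements are appended with push_back up to the reserved count, and pop_back only decrements the size; the popped element is destroyed when the vector itself is.

#include <cstdio>
#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

int main() {
  Eigen::MaxSizeVector<int> v(4);              // capacity 4, size 0
  for (int i = 0; i < 4; ++i) v.push_back(i);  // size 4; a 5th push_back would assert
  v.pop_back();                                // size 3; value destroyed with the vector
  for (const int* it = v.begin(); it != v.end(); ++it) std::printf("%d\n", *it);
  return 0;
}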
@@ -28,7 +28,7 @@ struct packLhsArg {
 
 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
 struct packRhsAndKernelArg {
-  const std::vector<LhsScalar*>* blockAs;
+  const MaxSizeVector<LhsScalar*>* blockAs;
   RhsScalar* blockB;
   const RhsMapper& rhs;
   OutputMapper& output;
@@ -46,8 +46,8 @@ struct packRhsAndKernelArg {
   const Index n_block_idx;
   const Index m_blocks;
   const Index n_blocks;
-  std::vector<Notification*>* kernel_notifications;
-  const std::vector<Notification*>* lhs_notifications;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
   const bool need_to_pack;
 };
 
@@ -202,8 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // the alignment requirements with the assumption that
     // (Traits::mr * sizeof(ResScalar)) % 16 == 0
     const Index numBlockAs = numext::mini(num_threads, m_blocks);
-    std::vector<LhsScalar *> blockAs;
-    blockAs.reserve(num_threads);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
    for (int i = 0; i < num_threads; i++) {
      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
    }
@@ -212,18 +211,17 @@
     // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
     // Other options: (1) reuse memory when a thread finishes. con: tricky
     //                (2) allocate block B memory in each thread. con: overhead
-    std::vector<RhsScalar *> blockBs;
-    blockBs.reserve(n_blocks);
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
    for (int i = 0; i < n_blocks; i++) {
      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
    }
 
     // lhs_notifications starts with all null Notifications
-    std::vector<Notification*> lhs_notifications(num_threads, nullptr);
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_notifications = num_threads * n_blocks;
-    std::vector<Notification*> kernel_notifications(num_kernel_notifications,
-                                                    nullptr);
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
+                                                      nullptr);
 
    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
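Note that two different MaxSizeVector constructors are in play in the hunk above: blockBs(n_blocks) is the capacity-only form (size 0, filled by push_back), while lhs_notifications(num_threads, nullptr) is the fill form (size equals capacity immediately, with slots overwritten in place later). A small sketch of the distinction, with illustrative values and the same assumed include path as before:

#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

void constructor_shapes() {
  int x = 0;
  Eigen::MaxSizeVector<int*> grow(3);             // capacity-only: size() == 0
  grow.push_back(&x);                             // append up to the capacity
  Eigen::MaxSizeVector<int*> filled(3, nullptr);  // fill form: size() == 3 at once
  filled[2] = &x;                                 // slots can be overwritten in place
}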
@@ -127,8 +127,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
     const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
     const Index numblocks = size / blocksize;
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
+    MaxSizeVector<Notification*> results(numblocks);
    for (int i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
    }
@@ -256,9 +256,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(
          device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
@@ -308,9 +307,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    std::vector<Notification*> results;
-    results.reserve(numblocks);
-    std::vector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    MaxSizeVector<Notification*> results(numblocks);
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
                                       self, i * blocksize, blocksize, reducer,
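For context, the two FullReducer hunks above follow a shard-then-combine pattern: one accumulator per block, seeded with the reducer's identity value, reduced in parallel, then folded together. A hedged single-threaded sketch of that shape (generic code, not Eigen's actual implementation; it assumes a reducer exposing initialize() and reduce(value, accum) in the style the diff implies):

#include <unsupported/Eigen/CXX11/Tensor>  // assumed include path for MaxSizeVector

template <typename Reducer, typename Scalar>
Scalar blockwise_reduce(const Scalar* data, int num_coeffs, int numblocks,
                        int blocksize, Reducer reducer) {
  // One accumulator per block; in Eigen each block runs as a FullReducerShard
  // on the thread pool, with a Notification signaling its completion.
  Eigen::MaxSizeVector<Scalar> shards(numblocks, reducer.initialize());
  for (int b = 0; b < numblocks; ++b)
    for (int i = b * blocksize; i < (b + 1) * blocksize; ++i)
      reducer.reduce(data[i], &shards[b]);
  // Combine the per-block partials, plus any tail the blocks did not cover.
  Scalar acc = reducer.initialize();
  for (int i = numblocks * blocksize; i < num_coeffs; ++i)
    reducer.reduce(data[i], &acc);
  for (int b = 0; b < numblocks; ++b)
    reducer.reduce(shards[b], &acc);
  return acc;
}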