Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-14 20:56:00 +08:00)
Remove block memory allocation required by removed block evaluation API
This commit is contained in:
parent 1c879eb010
commit c9220c035f
@@ -223,30 +223,14 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
 
 template <typename TensorBlockMapper>
 struct TensorExecutorTilingContext {
-  TensorExecutorTilingContext() : buffer(nullptr) {}
   TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
-                              const TensorOpCost& b_cost, void* b_buffer,
-                              size_t b_aligned_size)
+                              const TensorOpCost& b_cost, size_t b_aligned_size)
       : block_mapper(b_mapper),
         cost(b_cost),
-        buffer(b_buffer),
         aligned_blocksize(b_aligned_size) {}
 
-  template <typename Scalar>
-  Scalar* GetCurrentThreadBuffer(const ThreadPoolDevice& device) const {
-    // ThreadPoolDevice::currentThreadId() returns -1 if called from a thread
-    // not in the thread pool, such as the main thread dispatching Eigen
-    // expressions.
-    const int thread_idx = device.currentThreadId();
-    eigen_assert(thread_idx >= -1 && thread_idx < device.numThreads());
-
-    const Index offset = aligned_blocksize * (thread_idx + 1);
-    return reinterpret_cast<Scalar*>(static_cast<char*>(buffer) + offset);
-  }
-
   TensorBlockMapper block_mapper;  // navigate through blocks
   TensorOpCost cost;               // cost of computing a single block
-  void* buffer;                    // temporary buffer for blocks
   size_t aligned_blocksize;        // block size after memory alignment
 };
 
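Note (illustration, not part of the commit): the removed GetCurrentThreadBuffer helper carved one shared allocation into per-thread slices of aligned_blocksize bytes, with one extra slot for a caller outside the pool (currentThreadId() == -1). A minimal standalone sketch of that addressing scheme; num_threads, aligned_blocksize, and the offset formula come from the diff, everything else is illustrative:

    #include <cstddef>
    #include <cstdlib>

    int main() {
      const int num_threads = 4;               // stands in for ThreadPoolDevice::numThreads()
      const std::size_t aligned_blocksize = 256;

      // One slot per pool thread plus one for the dispatching thread
      // (thread_idx == -1), matching the (num_threads + 1) * aligned_blocksize
      // allocation that the old code performed.
      char* buffer =
          static_cast<char*>(std::malloc((num_threads + 1) * aligned_blocksize));

      // Removed helper's addressing: offset = aligned_blocksize * (thread_idx + 1).
      const int thread_idx = -1;               // main thread dispatching the expression
      char* slice = buffer + aligned_blocksize * (thread_idx + 1);
      (void)slice;                             // this thread's slice for block temporaries

      std::free(buffer);
      return 0;
    }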
@@ -254,37 +238,27 @@ struct TensorExecutorTilingContext {
 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
-    const ThreadPoolDevice& device, const Evaluator& evaluator,
-    bool allocate_buffer = true) {
+    const Evaluator& evaluator) {
   // Query expression tree for desired block size/shape.
-  const TensorBlockResourceRequirements requirements =
+  TensorBlockResourceRequirements requirements =
       evaluator.getResourceRequirements();
 
-  int num_threads = device.numThreads();
-
-  // Estimate minimum block size based on cost.
+  // Update target block size based on cost model.
   TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
   double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
-  size_t block_size = static_cast<size_t>(1.0 / taskSize);
+  requirements.size = static_cast<size_t>(1.0 / taskSize);
 
   TensorBlockMapper block_mapper(
       typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
       requirements);
 
-  block_size = block_mapper.blockTotalSize();
+  size_t block_size = block_mapper.blockTotalSize();
   const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
   const size_t aligned_blocksize =
       align *
       divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
 
-  // TODO(ezhulenev): In new block evaluation framework there is no need for
-  // allocating temporary buffers, remove this after migration.
-  void* buf = NULL;
-  if (allocate_buffer) {
-    buf = device.allocate((num_threads + 1) * aligned_blocksize);
-  }
-
-  return {block_mapper, cost * block_size, buf, aligned_blocksize};
+  return {block_mapper, cost * block_size, aligned_blocksize};
 }
 
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
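The aligned block size computation is kept by the new code; as a quick illustration of the rounding it performs, here is a self-contained sketch (divup is written out because Eigen's internal helper is not pulled in, and the example numbers are arbitrary):

    #include <cstddef>
    #include <cstdio>

    // Mirrors: align * divup(block_size * sizeof(Scalar), align).
    static std::size_t divup(std::size_t x, std::size_t y) { return (x + y - 1) / y; }

    int main() {
      const std::size_t align = 64;          // e.g. EIGEN_MAX_ALIGN_BYTES
      const std::size_t block_size = 1000;   // coefficients per block
      const std::size_t scalar_bytes = 4;    // e.g. sizeof(float)

      const std::size_t aligned_blocksize =
          align * divup(block_size * scalar_bytes, align);

      std::printf("%zu\n", aligned_blocksize);  // prints 4032: 4000 bytes rounded up to 63 * 64
      return 0;
    }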
@@ -393,8 +367,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
     if (needs_assign) {
       const TilingContext tiling =
           internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
-                                                   Vectorizable>(
-              device, evaluator, /*allocate_buffer=*/false);
+                                                   Vectorizable>(evaluator);
 
       auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                        IndexType lastBlockIdx) {
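A hypothetical usage example (not part of the commit) that exercises the thread-pool executor path touched above; with this change, block temporaries for such an assignment come from the per-task scratch created inside eval_block rather than from a buffer pre-allocated by GetTensorExecutorTilingContext:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, 4);

      Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
      a.setRandom();
      b.setRandom();

      // Dispatches through TensorExecutor<..., ThreadPoolDevice, ...> with tiling.
      c.device(device) = a + b * 0.5f;
      return 0;
    }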
@@ -498,10 +471,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
         return;
       }
 
-      ctx->tiling =
-          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
-                                                   Vectorizable>(
-              ctx->device, ctx->evaluator, /*allocate_buffer=*/false);
+      ctx->tiling = internal::GetTensorExecutorTilingContext<
+          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
 
       auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
         TensorBlockScratch scratch(ctx->device);
@@ -531,7 +502,6 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
           on_done(std::move(done)) {}
 
     ~TensorAsyncExecutorContext() {
-      device.deallocate(tiling.buffer);
       evaluator.cleanup();
       on_done();
     }