mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-07-23 05:14:26 +08:00)
Add block evaluation V2 to TensorAsyncExecutor.
Add async evaluation to a number of ops.

commit 97c0c5d485, parent 668ab3fc47
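For context, the user-facing entry point behind these changes is the asynchronous device assignment exercised by the tests at the bottom of this diff. A minimal sketch of how it is used (illustrative, not part of the commit; it assumes a ThreadPoolDevice set up the same way the Eigen tests do):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, 4);

      Eigen::Tensor<float, 2> A(64, 64), B(64, 64), C(64, 64);
      A.setRandom();
      B.setRandom();

      // Same pattern as the tests below: signal a barrier from the done callback.
      Eigen::Barrier done(1);
      auto on_done = [&done]() { done.Notify(); };

      // Asynchronous assignment: returns immediately and evaluates on the pool.
      C.device(device, std::move(on_done)) = A + B;

      done.Wait();  // block until the async evaluation has completed
      return 0;
    }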
@@ -1129,16 +1129,11 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
       return TensorDevice<Derived, DeviceType>(dev, derived());
     }

-#ifdef EIGEN_USE_THREADS
     // Select the async device on which to evaluate the expression.
     template <typename DeviceType, typename DoneCallback>
-    typename internal::enable_if<
-        internal::is_same<DeviceType, ThreadPoolDevice>::value,
-        TensorAsyncDevice<Derived, DeviceType, DoneCallback>>::type
-    device(const DeviceType& dev, DoneCallback done) {
+    TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
       return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
     }
-#endif  // EIGEN_USE_THREADS

   protected:
     EIGEN_DEVICE_FUNC
@@ -63,18 +63,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
     ExpressionType& m_expression;
 };

-#ifdef EIGEN_USE_THREADS
-
 /** \class TensorAsyncDevice
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Pseudo expression providing an operator = that will evaluate its
- * argument asynchronously on the specified device (currently supports only
- * ThreadPoolDevice).
- *
- * Example:
- *    auto done = []() { ... expression evaluation done ... };
- *    C.device(EIGEN_THREAD_POOL, std::move(done)) = A + B;
+ * argument asynchronously on the specified device. Currently only
+ * ThreadPoolDevice implements proper asynchronous execution, while the default
+ * and GPU devices just run the expression synchronously and call m_done() on
+ * completion.
+ *
+ * Example:
+ *    auto done = []() { ... expression evaluation done ... };
+ *    C.device(thread_pool_device, std::move(done)) = A + B;
  */
 template <typename ExpressionType, typename DeviceType, typename DoneCallback>
@@ -87,11 +87,11 @@ class TensorAsyncDevice {
   template <typename OtherDerived>
   EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
     typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
-    typedef internal::TensorAsyncExecutor<const Assign, DeviceType, DoneCallback> Executor;
+    typedef internal::TensorExecutor<const Assign, DeviceType> Executor;

-    // WARNING: After assignment 'm_done' callback will be in undefined state.
     Assign assign(m_expression, other);
-    Executor::runAsync(assign, m_device, std::move(m_done));
+    Executor::run(assign, m_device);
+    m_done();

     return *this;
   }
@@ -102,7 +102,33 @@ class TensorAsyncDevice {
   DoneCallback m_done;
 };

-#endif  // EIGEN_USE_THREADS
+#ifdef EIGEN_USE_THREADS
+
+template <typename ExpressionType, typename DoneCallback>
+class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
+ public:
+  TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
+
+    // WARNING: After assignment 'm_done' callback will be in undefined state.
+    Assign assign(m_expression, other);
+    Executor::runAsync(assign, m_device, std::move(m_done));
+
+    return *this;
+  }
+
+ protected:
+  const ThreadPoolDevice& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+#endif

 } // end namespace Eigen
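A note on the WARNING comment above: operator= forwards m_done with std::move into TensorAsyncExecutor::runAsync, so the callback is single-shot and a given TensorAsyncDevice should be assigned through exactly once. A sketch of that constraint (illustrative names, not from the commit):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    void single_assignment_only(Eigen::Tensor<float, 2>& C,
                                const Eigen::Tensor<float, 2>& A,
                                const Eigen::Tensor<float, 2>& B,
                                const Eigen::ThreadPoolDevice& device) {
      Eigen::Barrier done(1);
      auto dev = C.device(device, [&done]() { done.Notify(); });
      dev = A + B;     // consumes m_done
      // dev = A - B;  // would invoke the moved-from callback: don't do this
      done.Wait();
    }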
@@ -151,6 +151,16 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }

+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType scalar, EvalSubExprsCallback done) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
+  }
+#endif
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
@@ -102,7 +102,7 @@ class TensorExecutor {
  * available for ThreadPoolDevice (see definition below).
  */
 template <typename Expression, typename Device, typename DoneCallback,
-          bool Vectorizable, bool Tileable>
+          bool Vectorizable, TiledEvaluation Tiling>
 class TensorAsyncExecutor {};

 /**
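The bool Tileable parameter of TensorAsyncExecutor becomes the three-state TiledEvaluation selector; the enumerators visible in this diff are Off, On (block evaluation V2, via evalBlockV2), and Legacy (the previous block evaluation, via evalBlock). A sketch of the selector follows; the real definition lives elsewhere in the tree and the exact underlying values are an assumption here:

    // Assumed shape; only the three enumerator names appear in this diff.
    enum TiledEvaluation {
      Off = 0,     // no block evaluation: scalar/packet loops
      On = 1,      // block evaluation V2 (TensorBlockDesc + evalBlockV2)
      Legacy = 2,  // original block evaluation (thread buffers + evalBlock)
    };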
@@ -544,9 +544,9 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
 };

 template <typename Expression, typename DoneCallback, bool Vectorizable,
-          bool Tileable>
+          TiledEvaluation Tiling>
 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
-                          Vectorizable, Tileable> {
+                          Vectorizable, Tiling> {
  public:
   typedef typename Expression::Index StorageIndex;
   typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
@@ -598,7 +598,7 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,

 template <typename Expression, typename DoneCallback, bool Vectorizable>
 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
-                          Vectorizable, /*Tileable*/ true> {
+                          Vectorizable, /*Tileable*/ TiledEvaluation::Legacy> {
  public:
   typedef typename traits<Expression>::Index StorageIndex;
   typedef typename traits<Expression>::Scalar Scalar;
@@ -607,7 +607,9 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
   static const int NumDims = traits<Expression>::NumDimensions;

   typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-  typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> BlockMapper;
+  typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims,
+                            Evaluator::Layout>
+      BlockMapper;
   typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

   static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
@@ -624,7 +626,7 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
       auto delete_ctx = [ctx]() { delete ctx; };
       internal::TensorAsyncExecutor<
           Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
-          /*Tileable*/ false>::runAsync(expr, device, std::move(delete_ctx));
+          /*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx));
       return;
     }
@@ -635,22 +637,102 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
       }

       ctx->tiling =
-          GetTensorExecutorTilingContext<Evaluator, BlockMapper,
-                                         Vectorizable>(device, ctx->evaluator);
+          GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(
+              device, ctx->evaluator);

-      device.parallelForAsync(
-          ctx->tiling.block_mapper.total_block_count(), ctx->tiling.cost,
-          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
-            ScalarNoConst* thread_buf =
-                ctx->tiling.template GetCurrentThreadBuffer<ScalarNoConst>(
-                    ctx->device);
-            for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
-              auto block =
-                  ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
-              ctx->evaluator.evalBlock(&block);
-            }
-          },
-          [ctx]() { delete ctx; });
+      auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
+        ScalarNoConst* thread_buf =
+            ctx->tiling.template GetCurrentThreadBuffer<ScalarNoConst>(
+                ctx->device);
+        for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
+          auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
+          ctx->evaluator.evalBlock(&block);
+        }
+      };
+      device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
+                              ctx->tiling.cost, eval_block,
+                              [ctx]() { delete ctx; });
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : device(thread_pool),
+          evaluator(expr, thread_pool),
+          on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      on_done();
+      device.deallocate(tiling.buffer);
+      evaluator.cleanup();
+    }
+
+    const ThreadPoolDevice& device;
+    Evaluator evaluator;
+    TilingContext tiling;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
+                            Evaluator::Layout>
+      BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      ctx->tiling =
+          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+                                                   Vectorizable>(
+              ctx->device, ctx->evaluator, /*allocate_buffer=*/false);
+
+      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(ctx->device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          auto block =
+              ctx->tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
+          TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+          ctx->evaluator.evalBlockV2(desc, scratch);
+          scratch.reset();
+        }
+      };
+      ctx->device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
+                                   ctx->tiling.cost, eval_block,
+                                   [ctx]() { delete ctx; });
     };

     ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
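TensorAsyncExecutorContext carries the ownership story for the whole async run: runAsync heap-allocates it, every continuation captures the raw pointer, whichever callback runs last deletes it, and the destructor fires on_done and then releases the tiling buffer and the evaluator. The pattern in isolation (a standalone sketch, not Eigen code):

    #include <cstdio>
    #include <utility>

    // Completion-context pattern used above: deleting the context *is* the
    // completion signal.
    template <typename DoneCallback>
    struct AsyncContext {
      explicit AsyncContext(DoneCallback done) : on_done(std::move(done)) {}
      ~AsyncContext() {
        on_done();  // notify the caller exactly once
        // ... deallocate buffers, clean up the evaluator ...
      }
      DoneCallback on_done;
    };

    int main() {
      auto* ctx = new AsyncContext<void (*)()>(+[] { std::puts("done"); });
      // Work would be scheduled here; the last completion callback runs:
      auto last_callback = [ctx]() { delete ctx; };  // fires on_done()
      last_callback();
    }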
@@ -682,7 +764,6 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,

 #endif  // EIGEN_USE_THREADS

-
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU)
@@ -132,14 +132,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     const Index numValues = internal::array_prod(m_impl.dimensions());
     m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
-#ifndef EIGEN_USE_SYCL
-    // Should initialize the memory in case we're dealing with non POD types.
-    if (NumTraits<CoeffReturnType>::RequireInitialization) {
-      for (Index i = 0; i < numValues; ++i) {
-        new(m_buffer+i) CoeffReturnType();
-      }
-    }
-#endif
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
     EvalTo evalToTmp(m_device.get(m_buffer), m_op);
@@ -151,6 +143,29 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
     return true;
   }

+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    const Index numValues = internal::array_prod(m_impl.dimensions());
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
+        numValues * sizeof(CoeffReturnType)));
+    typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
+        EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    auto on_done = std::bind([](EvalSubExprsCallback done) { done(true); },
+                             std::move(done));
+    internal::TensorAsyncExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        decltype(on_done),
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        runAsync(evalToTmp, m_device, std::move(on_done));
+  }
+#endif
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_device.deallocate_temp(m_buffer);
     m_buffer = NULL;
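The std::bind in the new forced-eval path adapts callback arities: the caller's done takes a bool, while the executor's DoneCallback must be nullary, and std::bind moves the callback into the wrapper without requiring C++14 move-capture lambdas. A standalone sketch of the same adaptation:

    #include <functional>
    #include <iostream>

    int main() {
      // Caller-side callback, as passed to evalSubExprsIfNeededAsync.
      auto done = [](bool need_assign) {
        std::cout << "evaluated, need_assign=" << need_assign << "\n";
      };
      // Nullary adapter like the one built above; with C++14 one could write
      // [d = std::move(done)]() { d(true); } instead.
      auto on_done = std::bind([](decltype(done) d) { d(true); }, std::move(done));
      on_done();  // prints: evaluated, need_assign=1
    }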
@@ -185,12 +185,12 @@ template <typename Expression, typename Device,
           TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
 class TensorExecutor;

-// TODO(ezhulenev): Add TiledEvaluation support to async executor.
 template <typename Expression, typename Device, typename DoneCallback,
           bool Vectorizable = IsVectorizable<Device, Expression>::value,
-          bool Tileable = IsTileable<Device, Expression>::BlockAccess>
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
 class TensorAsyncExecutor;

 } // end namespace internal

 } // end namespace Eigen
@@ -205,6 +205,14 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
@@ -689,15 +689,14 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

   EIGEN_STRONG_INLINE
 #if !defined(EIGEN_HIPCC)
-  // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same for all the functions
-  // being called within here, which then leads to proliferation of EIGEN_DEVICE_FUNC markings, one
-  // of which will eventually result in an NVCC error
+  // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same
+  // for all the functions being called within here, which then leads to
+  // proliferation of EIGEN_DEVICE_FUNC markings, one of which will eventually
+  // result in an NVCC error
   EIGEN_DEVICE_FUNC
 #endif
-  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
+  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {

     // Use the FullReducer if possible.
     if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
         internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
@@ -802,6 +801,34 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
     return true;
   }

+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalSubExprsIfNeededAsync(EvaluatorPointerType data,
+                                EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
+      done(evalSubExprsIfNeededCommon(data));
+    });
+  }
+#endif
+
+  EIGEN_STRONG_INLINE
+#if !defined(EIGEN_HIPCC)
+  // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same
+  // for all the functions being called within here, which then leads to
+  // proliferation of EIGEN_DEVICE_FUNC markings, one of which will eventually
+  // result in an NVCC error
+  EIGEN_DEVICE_FUNC
+#endif
+  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return evalSubExprsIfNeededCommon(data);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
     if (m_result) {
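The reduction change splits the old evalSubExprsIfNeeded into a shared synchronous tail (evalSubExprsIfNeededCommon) plus thin sync and async front ends, so the async front end can chain: evaluate the child expression asynchronously, then run the common tail and forward its result to the caller. The shape of that chaining with stand-in types (a sketch, not the Eigen classes):

    #include <iostream>

    // Stand-in for a child evaluator with an async subexpression hook.
    struct ChildImpl {
      template <typename Cb>
      void evalSubExprsIfNeededAsync(void*, Cb done) { done(true); }
    };

    struct ReductionEvaluator {
      ChildImpl m_impl;
      bool evalSubExprsIfNeededCommon(void*) { return true; }  // shared sync tail

      // Child first, common tail after, result forwarded to the user callback.
      template <typename Done>
      void evalSubExprsIfNeededAsync(void* data, Done done) {
        m_impl.evalSubExprsIfNeededAsync(nullptr, [this, data, done](bool) {
          done(evalSubExprsIfNeededCommon(data));
        });
      }
    };

    int main() {
      ReductionEvaluator ev;
      ev.evalSubExprsIfNeededAsync(nullptr, [](bool need_assign) {
        std::cout << "reduction subexprs ready: " << need_assign << "\n";
      });
    }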
@@ -604,11 +604,10 @@ static void test_async_execute_unary_expr(Device d)
   Eigen::Barrier done(1);
   auto on_done = [&done]() { done.Notify(); };

-  static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true;
   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
   using DoneCallback = decltype(on_done);
   using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
-                                                 Vectorizable, TilingOn>;
+                                                 Vectorizable, Tiling>;

   Executor::runAsync(Assign(dst, expr), d, on_done);
   done.Wait();
@@ -641,11 +640,10 @@ static void test_async_execute_binary_expr(Device d)
   Eigen::Barrier done(1);
   auto on_done = [&done]() { done.Notify(); };

-  static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true;
   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
   using DoneCallback = decltype(on_done);
   using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
-                                                 Vectorizable, TilingOn>;
+                                                 Vectorizable, Tiling>;

   Executor::runAsync(Assign(dst, expr), d, on_done);
   done.Wait();