Block evaluation for TensorGenerator/TensorReverse/TensorShuffling
commit d380c23b2c
parent 39fb9eeccf
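In short: the V2 block API gains a `root_of_expr_ast` flag, and a materialized block can now mark its expression as unusable when it was written straight into a destination buffer whose strides differ from the block strides. A minimal caller-side sketch, illustrative only; `eval`, `desc`, `scratch` and `lhs` are hypothetical names for an evaluator, block descriptor, scratch allocator and left-hand-side evaluator set up as in the tests below, mirroring how TensorAssignOp uses the new API in this patch:

    // Sketch, not code from the patch.
    auto block = eval.blockV2(desc, scratch, /*root_of_expr_ast=*/true);
    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
      // The block was not written to the destination buffer, so assign it;
      // in this branch block.expr() is guaranteed to be a valid expression.
      lhs.writeBlockV2(desc, block);
    }
    block.cleanup();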
@@ -242,9 +242,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
             (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
   }

-    RightTensorBlock block = m_rightImpl.blockV2(desc, scratch);
+    RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true);
-    // If block was evaluated into a destination, there is no need to do
-    // assignment.
+    // If block was evaluated into a destination, there is no need to do assignment.
     if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
       m_leftImpl.writeBlockV2(desc, block);
     }
@@ -45,6 +45,12 @@ EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
   return strides;
 }

+template<int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const Eigen::array<IndexType, NumDims>& dimensions) {
+  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
 #if EIGEN_HAS_CXX11
 template <int Layout, std::ptrdiff_t... Indices>
 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
@@ -78,42 +84,6 @@ class TensorBlockDescriptor {
       return static_cast<Scalar*>(m_data);
     }

-   private:
-    friend class TensorBlockDescriptor;
-
-    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
-
-    template <typename Scalar>
-    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
-                      const Dimensions& strides, size_t total_dst_bytes)
-        : m_data(static_cast<void*>(data)),
-          m_dimensions(dimensions),
-          m_strides(strides),
-          m_total_dst_bytes(total_dst_bytes) {
-      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
-      for (int i = 0; i < NumDims; ++i) {
-        m_dimensions[i] *= sizeof(Scalar);
-        m_strides[i] *= sizeof(Scalar);
-      }
-    }
-
-    // Returns true if the tensor block corresponding to `desc` fits into the
-    // contiguous block of memory defined by `*this`.
-    template <typename Scalar, int Layout>
-    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
-      if (m_data == NULL) return false;
-
-      const Dimensions& desc_dims = desc.dimensions();
-      const Dimensions& dst_dims = dimensions<Scalar>();
-
-      if (!dimensions_match(desc_dims, dst_dims)) return false;
-
-      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
-      const Dimensions& dst_strides = internal::strides<Layout>(dst_dims);
-
-      return dimensions_match(desc_strides, dst_strides);
-    }
-
     template <typename Scalar>
     Dimensions dimensions() const {
       Dimensions dimensions;
@@ -134,6 +104,48 @@ class TensorBlockDescriptor {
       return strides;
     }

+    // Returns true if the tensor block corresponding to `desc` fits into the
+    // contiguous block of memory defined by `*this`.
+    template <typename Scalar, int Layout>
+    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
+      if (m_data == NULL) return false;
+
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& dst_dims = dimensions<Scalar>();
+
+      if (!dimensions_match(desc_dims, dst_dims)) return false;
+
+      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+      const Dimensions& dst_strides = strides<Scalar>();
+
+      // Compare strides ignoring dimensions of size `1`.
+      for (int i = 0; i < NumDims; ++i) {
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != dst_strides[i]) return false;
+      }
+
+      return true;
+    }
+
+   private:
+    friend class TensorBlockDescriptor;
+
+    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
+
+    template <typename Scalar>
+    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
+                      const Dimensions& strides, size_t total_dst_bytes)
+        : m_data(static_cast<void*>(data)),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_total_dst_bytes(total_dst_bytes) {
+      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
+      for (int i = 0; i < NumDims; ++i) {
+        m_dimensions[i] *= sizeof(Scalar);
+        m_strides[i] *= sizeof(Scalar);
+      }
+    }
+
     void* m_data;
     Dimensions m_dimensions;
     Dimensions m_strides;
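The relaxed check above compares strides only for dimensions larger than one, since a size-1 dimension never advances an address. A self-contained illustration, assuming the internal `strides<Layout>()` helper added earlier in this patch; the destination strides are hypothetical numbers, not taken from the commit:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Illustrative only: a size-1 dimension does not break contiguity.
    bool example_fits() {
      Eigen::DSizes<Eigen::Index, 3> desc_dims(4, 1, 3);
      Eigen::DSizes<Eigen::Index, 3> desc_strides =
          Eigen::internal::strides<Eigen::ColMajor>(desc_dims);  // {1, 4, 4}
      Eigen::DSizes<Eigen::Index, 3> dst_strides(1, 6, 4);  // padded destination
      for (int i = 0; i < 3; ++i) {
        if (desc_dims[i] == 1) continue;                    // ignore size-1 dims
        if (desc_strides[i] != dst_strides[i]) return false;
      }
      return true;  // the mismatch at i == 1 is irrelevant, the block fits
    }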
@@ -181,6 +193,12 @@ class TensorBlockDescriptor {
     return *this;
   }

+  bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }
+
+  const DestinationBuffer& GetDestinationBuffer() const {
+    return m_destination;
+  }
+
   // Returns a non-nullptr pointer to a destination buffer memory if this
   // block has a contiguous destination buffer.
   template <typename Scalar, int Layout>
@@ -191,6 +209,11 @@ class TensorBlockDescriptor {
     return NULL;
   }

+  // Returns a copy of `*this` with updated offset.
+  TensorBlockDescriptor WithOffset(IndexType offset) const {
+    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+  }
+
  private:
  // Offset and dimensions are immutable after construction. Block descriptor
  // can only be mutated by adding or dropping destination.
@@ -294,18 +317,12 @@ enum TensorBlockKind {

   // Tensor block that was materialized directly into the final output memory
   // buffer. For example if the left side of an assignment is a Tensor, we can
-  // directly materialize the block in the destination memory. The block
-  // expression is still a valid Tensor expression, and can be used to build
-  // lazy expressions.
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
   kMaterializedInOutput
-
-  // TODO(ezhulenev): If we know that we are evaluating a block, for the root of
-  // the expression tree, it might be beneficial to do an assignment to the
-  // output memory buffer, even if it will be impossible to construct a valid
-  // block expression after that (e.g. output memory buffer has strides not
-  // compatible with TensorMap). This might be a performance optimization for
-  // uniformly shaped blocks, because for blocks skewed towards inner dimension
-  // `kMaterializedInOutput` should always work.
 };
 #if !EIGEN_HAS_CXX11
 }  // namespace TensorBlockKind
@@ -346,6 +363,11 @@ struct XprScalar<void> {
 // Tensor), or a memory buffer allocated with scratch allocator, and in this
 // case the scratch allocator will deallocate it at the end of block based
 // expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.

 template <typename Scalar, int NumDims, int Layout,
           typename IndexType = Eigen::Index>
@@ -358,11 +380,12 @@ class TensorMaterializedBlock {
   typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

   TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
-                          const Dimensions& dimensions)
+                          const Dimensions& dimensions, bool valid_expr = true)
       : m_kind(kind),
         m_data(data),
         m_dimensions(dimensions),
-        m_expr(m_data, m_dimensions) {
+        m_expr(m_data, m_dimensions),
+        m_valid_expr(valid_expr) {
     eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                  m_kind == internal::TensorBlockKind::kMaterializedInOutput);
@@ -372,7 +395,10 @@ class TensorMaterializedBlock {
   // NOTE(ezhulenev): Returning XprType by value like in other block types
   // causes asan failures. The theory is that XprType::Nested doesn't work
   // properly for TensorMap.
-  const XprType& expr() const { return m_expr; }
+  const XprType& expr() const {
+    eigen_assert(m_valid_expr);
+    return m_expr;
+  }
   const Scalar* data() const { return m_data; }
   void cleanup() {}

@@ -427,6 +453,7 @@ class TensorMaterializedBlock {
     bool materialized_in_output;

     if (block_buffer != NULL) {
+      desc.DropDestinationBuffer();
       materialized_in_output = true;

     } else {
@@ -461,6 +488,7 @@ class TensorMaterializedBlock {
   const Scalar* m_data;
   Dimensions m_dimensions;
   XprType m_expr;
+  bool m_valid_expr;
 };

 // -------------------------------------------------------------------------- //
@@ -882,7 +882,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     static const bool
         is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);

@@ -368,7 +368,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     const Index chip_dim = m_dim.actualDim();

     DSizes<Index, NumInputDims> input_block_dims;
@@ -390,6 +391,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     }

     ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();

     if (arg_block.data() != NULL) {
       // Forward argument block buffer if possible.
@@ -405,6 +407,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       bool materialized_in_output;

       if (output_buffer != NULL) {
+        desc.DropDestinationBuffer();
         materialized_in_output = true;

       } else {
@@ -404,7 +404,8 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     return TensorBlockV2(m_impl.blockV2(desc, scratch),
                          TensorConversionOpBlockFactory());
   }
@@ -481,7 +481,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {


 template <typename Dims1, typename Dims2>
-EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
   return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
 }

@@ -166,7 +166,8 @@ struct TensorEvaluator
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     assert(m_data != NULL);
     return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
   }
@@ -353,7 +354,8 @@ struct TensorEvaluator<const Derived, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     assert(m_data != NULL);
     return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
   }
@@ -571,7 +573,8 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor);
   }

@@ -729,7 +732,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     desc.DropDestinationBuffer();
     return TensorBlockV2(m_leftImpl.blockV2(desc, scratch),
                          m_rightImpl.blockV2(desc, scratch), m_functor);
@@ -993,7 +997,8 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     // It's unsafe to pass destination buffer to underlying expressions, because
     // output might be aliased with one of the inputs.
     desc.DropDestinationBuffer();
@@ -521,19 +521,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
   static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                       const ThreadPoolDevice& device) {
     Evaluator evaluator(expr, device);
-    Index total_size = array_prod(evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    // TODO(ezuhulenev): For small expressions cost of block mapping and
-    // resource requirements gathering dominates the cost of expression
-    // evaluatiuon.
-    if (total_size < cache_size &&
-        !ExpressionHasTensorBroadcastingOp<Expression>::value) {
-      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
-                               /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
-      evaluator.cleanup();
-      return;
-    }

     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
@@ -176,7 +176,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     assert(m_buffer != NULL);
     return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
   }
@@ -238,7 +238,8 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     static const bool is_col_major =
         static_cast<int>(Layout) == static_cast<int>(ColMajor);

@@ -253,6 +254,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     bool materialized_in_output;

     if (block_buffer != NULL) {
+      desc.DropDestinationBuffer();
       materialized_in_output = true;

     } else {
@@ -365,7 +365,8 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     eigen_assert(m_impl.data() != NULL);
     eigen_assert((kind == Runtime) ||
                  (kind == OneByN && desc.dimensions()[0] == 1) ||
@@ -611,7 +612,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    BlockAccessV2 = false,
+    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
@@ -624,7 +625,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   typedef typename TensorBlock::Dimensions TensorBlockDimensions;

   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  // Tensor slicing does not change the block type.
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -804,6 +810,15 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     m_impl.block(&input_block);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+    TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+    return block;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
     typename Storage::Type result = constCast(m_impl.data());
     if (result) {
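For orientation, the offset rewrite above can be checked by hand. A worked example with hypothetical numbers, not taken from the patch: a column-major input with dimensions {5, 6} and a slice starting at {1, 2} has input strides {1, 5}, so a block descriptor at offset 0 inside the slice is rewritten by WithOffset(srcCoeff(0)) to offset 1*1 + 2*5 = 11 in the underlying expression; the destination buffer is kept only if the argument evaluator actually accepted it.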
@@ -900,7 +915,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    BlockAccessV2 = false,
+    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
@@ -913,7 +928,8 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef typename TensorBlock::Dimensions TensorBlockDimensions;

   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
   //===--------------------------------------------------------------------===//

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -987,6 +1003,13 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
         block.block_strides(), TensorBlockDimensions(this->m_inputStrides),
         const_cast<ScalarNoConst*>(block.data())));
   }

+  template<typename TensorBlockV2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+      const TensorBlockDesc& desc, const TensorBlockV2& block) {
+    TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+    this->m_impl.writeBlockV2(arg_desc, block);
+  }
 };

 namespace internal {
@@ -230,7 +230,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     // If one of the dimensions is zero, return empty block view.
     if (desc.size() == 0) {
       return TensorBlockV2(internal::TensorBlockKind::kView, NULL,
@@ -240,8 +241,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     // Check if we can reuse `desc` destination, or allocate new scratch buffer.
     ScalarNoConst* materialized_output =
         desc.template destination<ScalarNoConst, Layout>();

     bool materialized_in_output;

     if (materialized_output != NULL) {
       desc.DropDestinationBuffer();
       materialized_in_output = true;
@@ -355,7 +355,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     // TODO(ezhulenev): If underlying tensor expression supports and prefers
     // block evaluation we must use it. Currently we use coeff and packet
     // access into the underlying tensor expression.
@@ -370,10 +371,12 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     const bool inner_dim_reversed = m_reverse[inner_dim_idx];

     // Try to reuse destination as an output block buffer.
-    CoeffReturnType* block_buffer = desc.template destination<CoeffReturnType, Layout>();
+    CoeffReturnType* block_buffer =
+        desc.template destination<CoeffReturnType, Layout>();
     bool materialized_in_output;

     if (block_buffer != NULL) {
+      desc.DropDestinationBuffer();
       materialized_in_output = true;

     } else {
@@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     IsAligned = false,
     PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    BlockAccessV2 = false,
+    BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
@@ -131,7 +131,12 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
       TensorBlockReader;

   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -143,6 +148,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     const Shuffle& shuffle = op.shufflePermutation();
     m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
+      m_shuffle[i] = static_cast<int>(shuffle[i]);
       m_dimensions[i] = input_dims[shuffle[i]];
       m_inverseShuffle[shuffle[i]] = i;
       if (m_is_identity && shuffle[i] != i) {
@@ -241,7 +247,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         1, m_device.firstLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
         internal::kUniformAllDims, block_total_size_max));
-    m_impl.getResourceRequirements(resources);
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
@@ -336,6 +341,78 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     }
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    assert(m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    ScalarNoConst* block_buffer = NULL;
+    typename TensorBlockIO::Dimensions block_strides;
+
+    bool materialized_in_output = false;
+    bool has_valid_materialized_expr = true;
+
+    if (desc.HasDestinationBuffer()) {
+      // Check if we can reuse destination buffer for block materialization.
+      const typename TensorBlockDesc::DestinationBuffer& destination_buffer =
+          desc.GetDestinationBuffer();
+
+      const bool dims_match = dimensions_match(
+          desc.dimensions(), destination_buffer.template dimensions<Scalar>());
+
+      const bool strides_match =
+          dimensions_match(internal::strides<Layout>(desc.dimensions()),
+                           destination_buffer.template strides<Scalar>());
+
+      if (dims_match && strides_match) {
+        // Destination buffer fits the block contiguously.
+        materialized_in_output = true;
+        has_valid_materialized_expr = true;
+        block_buffer = destination_buffer.template data<ScalarNoConst>();
+        block_strides = internal::strides<Layout>(desc.dimensions());
+        eigen_assert(block_buffer != NULL);
+
+      } else if (dims_match && root_of_expr_ast) {
+        // Destination buffer has strides not matching the block strides, but
+        // for the root of the expression tree it's safe to materialize anyway.
+        materialized_in_output = true;
+        has_valid_materialized_expr = false;
+        block_buffer = destination_buffer.template data<ScalarNoConst>();
+        block_strides = destination_buffer.template strides<ScalarNoConst>();
+        eigen_assert(block_buffer != NULL);
+      }
+
+      if (materialized_in_output) desc.DropDestinationBuffer();
+    }
+
+    // If we were not able to reuse destination buffer, allocate temporary
+    // buffer for block evaluation using scratch allocator.
+    if (!materialized_in_output) {
+      void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst));
+      block_buffer = static_cast<ScalarNoConst*>(mem);
+      block_strides = internal::strides<Layout>(desc.dimensions());
+    }
+
+    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
+
+    TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer);
+
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    return TensorBlockV2(
+        materialized_in_output
+            ? internal::TensorBlockKind::kMaterializedInOutput
+            : internal::TensorBlockKind::kMaterializedInScratch,
+        block_buffer, desc.dimensions(), has_valid_materialized_expr);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                 NumDims * (2 * TensorOpCost::AddCost<Index>() +
@@ -400,7 +477,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

   Dimensions m_dimensions;
   bool m_is_identity;
-  array<Index, NumDims> m_inverseShuffle;
+  array<int, NumDims> m_shuffle;
+  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
@@ -431,7 +509,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
     IsAligned = false,
     PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    BlockAccessV2 = false,
+    BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
@@ -445,7 +523,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
       TensorBlockWriter;

   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
   //===--------------------------------------------------------------------===//

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -477,6 +555,63 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
                                  this->m_inverseShuffle,
                                  this->m_unshuffledInputStrides, this->m_impl.data());
   }

+  template <typename TensorBlockV2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+      const TensorBlockDesc& desc, const TensorBlockV2& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const Scalar* block_buffer = block.data();
+
+    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+    // expression with coefficient and packet access as `src`.
+    void* mem = NULL;
+    if (block_buffer == NULL) {
+      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+              buf),
+          block.expr());
+
+      block_buffer = buf;
+    }
+
+    // Read from block.
+    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
+                         block_buffer);
+
+    // Write to the output buffer.
+    typename TensorBlockIO::Dimensions output_strides(
+        this->m_unshuffledInputStrides);
+    typename TensorBlockIO::Dimensions output_dimensions;
+    for (int i = 0; i < NumDims; ++i) {
+      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+    }
+    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
+                         this->srcCoeff(desc.offset()));
+
+    // Reorder dimensions according to the shuffle.
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+    }
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    // Deallocate temporary buffer used for the block materialization.
+    if (mem != NULL) this->m_device.deallocate(mem);
+  }
 };

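The two copy directions above use opposite dimension maps: blockV2() reads from the input with m_shuffle, while writeBlockV2() writes back with m_inverseShuffle. A tiny self-contained illustration with a hypothetical permutation:

    #include <cstdio>

    int main() {
      // For output = input.shuffle({2, 0, 1}), output dimension i is taken
      // from input dimension shuffle[i]; inverse maps input dims back.
      int shuffle[3] = {2, 0, 1};
      int inverse[3];
      for (int i = 0; i < 3; ++i) inverse[shuffle[i]] = i;  // -> {1, 2, 0}
      std::printf("inverse = {%d, %d, %d}\n", inverse[0], inverse[1], inverse[2]);
      return 0;
    }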
@@ -139,23 +139,50 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
   // Evaluate TensorBlock expression into a tensor.
   Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());

+  // Dimensions for the potential destination buffer.
+  DSizes<Index, NumDims> dst_dims;
+  if (internal::random<bool>()) {
+    dst_dims = block_params.desc.dimensions();
+  } else {
+    for (int i = 0; i < NumDims; ++i) {
+      Index extent = internal::random<Index>(0, 5);
+      dst_dims[i] = block_params.desc.dimension(i) + extent;
+    }
+  }
+
   // Maybe use this tensor as a block desc destination.
-  Tensor<T, NumDims, Layout> dst(block_params.desc.dimensions());
+  Tensor<T, NumDims, Layout> dst(dst_dims);
+  dst.setZero();
   if (internal::random<bool>()) {
     block_params.desc.template AddDestinationBuffer(
         dst.data(), internal::strides<Layout>(dst.dimensions()),
         dst.dimensions().TotalSize() * sizeof(T));
   }

-  auto tensor_block = eval.blockV2(block_params.desc, scratch);
-  auto b_expr = tensor_block.expr();
-
-  // We explicitly disable vectorization and tiling, to run a simple coefficient
-  // wise assignment loop, because it's very simple and should be correct.
-  using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
-  using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
-                                       internal::TiledEvaluation::Off>;
-  BlockExecutor::run(BlockAssign(block, b_expr), d);
+  const bool root_of_expr = internal::random<bool>();
+  auto tensor_block = eval.blockV2(block_params.desc, scratch, root_of_expr);
+
+  if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
+    // Copy data from destination buffer.
+    if (dimensions_match(dst.dimensions(), block.dimensions())) {
+      block = dst;
+    } else {
+      DSizes<Index, NumDims> offsets;
+      for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+      block = dst.slice(offsets, block.dimensions());
+    }
+
+  } else {
+    // Assign to block from expression.
+    auto b_expr = tensor_block.expr();
+
+    // We explicitly disable vectorization and tiling, to run a simple coefficient
+    // wise assignment loop, because it's very simple and should be correct.
+    using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
+    using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
+                                         internal::TiledEvaluation::Off>;
+    BlockExecutor::run(BlockAssign(block, b_expr), d);
+  }

   // Cleanup temporary buffers owned by a tensor block.
   tensor_block.cleanup();
@@ -375,17 +402,16 @@ static void test_eval_tensor_generator() {
   Tensor<T, NumDims, Layout> input(dims);
   input.setRandom();

-  auto generator = [](const array<Index, NumDims>& dims) -> T {
+  auto generator = [](const array<Index, NumDims>& coords) -> T {
     T result = static_cast<T>(0);
     for (int i = 0; i < NumDims; ++i) {
-      result += static_cast<T>((i + 1) * dims[i]);
+      result += static_cast<T>((i + 1) * coords[i]);
     }
     return result;
   };

   VerifyBlockEvaluator<T, NumDims, Layout>(
-      input.generate(generator),
-      [&dims]() { return FixedSizeBlock(dims); });
+      input.generate(generator), [&dims]() { return FixedSizeBlock(dims); });

   VerifyBlockEvaluator<T, NumDims, Layout>(
       input.generate(generator),
@@ -403,12 +429,63 @@ static void test_eval_tensor_reverse() {
   for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();

   VerifyBlockEvaluator<T, NumDims, Layout>(
-      input.reverse(reverse),
-      [&dims]() { return FixedSizeBlock(dims); });
+      input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 10);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Pick a random slice of an input tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
   VerifyBlockEvaluator<T, NumDims, Layout>(
-      input.reverse(reverse),
-      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+    break;
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
 }

 template <typename T, int Layout>
@@ -564,7 +641,7 @@ static void test_assign_to_tensor_chipping() {
   Index chip_dim = internal::random<int>(0, NumDims - 1);
   Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);

-  DSizes < Index, NumDims - 1 > chipped_dims;
+  DSizes<Index, NumDims - 1> chipped_dims;
   for (Index i = 0; i < chip_dim; ++i) {
     chipped_dims[i] = dims[i];
   }
@ -587,42 +664,111 @@ static void test_assign_to_tensor_chipping() {
|
|||||||
[&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
|
[&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_assign_to_tensor_slice() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
|
||||||
|
Tensor<T, NumDims, Layout> tensor(dims);
|
||||||
|
|
||||||
|
// Pick a random slice of tensor.
|
||||||
|
DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
|
||||||
|
DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
|
||||||
|
|
||||||
|
// Make sure that slice start + size do not overflow tensor dims.
|
||||||
|
for (int i = 0; i < NumDims; ++i) {
|
||||||
|
slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
|
||||||
|
slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout>(
|
||||||
|
tensor, map.slice(slice_start, slice_size),
|
||||||
|
[&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout>(
|
||||||
|
tensor, map.slice(slice_start, slice_size),
|
||||||
|
[&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout>(
|
||||||
|
tensor, map.slice(slice_start, slice_size),
|
||||||
|
[&slice_size]() { return FixedSizeBlock(slice_size); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_assign_to_tensor_shuffle() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
|
||||||
|
Tensor<T, NumDims, Layout> tensor(dims);
|
||||||
|
|
||||||
|
DSizes<Index, NumDims> shuffle;
|
||||||
|
for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
|
||||||
|
|
||||||
|
TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
|
||||||
|
|
||||||
|
do {
|
||||||
|
DSizes<Index, NumDims> shuffled_dims;
|
||||||
|
for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout>(
|
||||||
|
tensor, map.shuffle(shuffle),
|
||||||
|
[&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout>(
|
||||||
|
tensor, map.shuffle(shuffle), [&shuffled_dims]() {
|
||||||
|
return RandomBlock<Layout>(shuffled_dims, 1, 5);
|
||||||
|
});
|
||||||
|
|
||||||
|
} while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
|
||||||
|
}
|
||||||
|
|
||||||
 // -------------------------------------------------------------------------- //

-#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \
-  CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
-  CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
-  CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
-  CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
-  CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
-  CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
-  CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
-  CALL_SUBTEST((NAME<float, 5, ColMajor>()))
-
-#define CALL_SUBTESTS_LAYOUTS(NAME) \
-  CALL_SUBTEST((NAME<float, RowMajor>())); \
-  CALL_SUBTEST((NAME<float, ColMajor>()))
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME)           \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>()))
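The `##` token-pasting in CALL_SUBTEST_PART is what routes each group of subtests to a numbered CALL_SUBTEST_N macro, which the test harness and CMake use to build the groups as separate test parts. A minimal stand-alone sketch of that expansion, where the CALL_SUBTEST_1/CALL_SUBTEST_2 bodies are hypothetical stand-ins rather than Eigen's real macros:

#include <cstdio>

// Hypothetical stand-ins for Eigen's numbered subtest macros.
#define CALL_SUBTEST_1(X) do { std::printf("part 1: "); X; } while (0)
#define CALL_SUBTEST_2(X) do { std::printf("part 2: "); X; } while (0)

// Same token-pasting pattern as in the patch above.
#define CALL_SUBTEST_PART(PART) CALL_SUBTEST_##PART
#define RUN_IN_PART(PART, NAME) CALL_SUBTEST_PART(PART)((NAME()))

static void hello() { std::printf("hello\n"); }

int main() {
  RUN_IN_PART(1, hello);  // expands to CALL_SUBTEST_1((hello()))
  RUN_IN_PART(2, hello);  // expands to CALL_SUBTEST_2((hello()))
  return 0;
}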
 EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
   // clang-format off
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_block);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_unary_expr_block);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_expr_block);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_with_unary_expr_block);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_select);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_chipping);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_generator);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reverse);
-
-  CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast);
-  CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval);
-
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape);
-  CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_binary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_shuffle);
+
+  CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_forced_eval);
+
+  CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_shuffle);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
   // clang-format on
 }
@@ -21,6 +21,30 @@ using Eigen::internal::TiledEvaluation;

 // A set of tests to verify that different TensorExecutor strategies yield the
 // same results for all the ops supporting tiled evaluation.

+// Default assignment that does not use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor =
+      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+                                      /*Vectorizable=*/false,
+                                      /*Tiling=*/TiledEvaluation::Off>;
+
+  Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
+          typename Dst, typename Expr>
+static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
+                                                   Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+}
+
 template <int NumDims>
 static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
   array<Index, NumDims> dims;
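Between the two hunks, for orientation: a minimal, self-contained sketch of the executor-based assignment pattern that DefaultAssign and DeviceAssign wrap. It is illustrative only; it mirrors the helper bodies above and relies on Eigen's internal TensorExecutor/TiledEvaluation API, which can change between Eigen versions.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using namespace Eigen;

  Tensor<float, 2> src(8, 8), dst(8, 8);
  src.setRandom();

  // Expression to evaluate; any tensor expression works here.
  const auto expr = src * src.constant(2.0f);

  // Same pattern the helpers wrap: build a TensorAssignOp and run it through
  // a TensorExecutor with an explicit vectorization/tiling configuration.
  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor = internal::TensorExecutor<const Assign, DefaultDevice,
                                            /*Vectorizable=*/false,
                                            /*Tiling=*/internal::TiledEvaluation::Off>;
  Executor::run(Assign(dst, expr), DefaultDevice());

  return 0;
}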
@@ -222,30 +246,32 @@ static void test_execute_shuffle_rvalue(Device d)
   Tensor<T, NumDims, Options, Index> src(dims);
   src.setRandom();

-  // Create a random dimension re-ordering/shuffle.
-  std::vector<Index> shuffle;
-  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
-  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
-
-  const auto expr = src.shuffle(shuffle);
-
-  // We assume that shuffling on a default device is tested and correct, so
-  // we can rely on it to verify correctness of tensor executor and tiling.
-  Tensor<T, NumDims, Options, Index> golden;
-  golden = expr;
-
-  // Now do the shuffling using configured tensor executor.
-  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
-
-  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
-  using Executor =
-      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
-
-  Executor::run(Assign(dst, expr), d);
-
-  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
-    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
-  }
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) {
+      shuffled_dims[i] = dims[shuffle[i]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
 }

 template <typename T, int NumDims, typename Device, bool Vectorizable,
@@ -258,33 +284,30 @@ static void test_execute_shuffle_lvalue(Device d)
   Tensor<T, NumDims, Options, Index> src(dims);
   src.setRandom();

-  // Create a random dimension re-ordering/shuffle.
-  std::vector<Index> shuffle;
-  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
-  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
-
-  array<Index, NumDims> shuffled_dims;
-  for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
-
-  // We assume that shuffling on a default device is tested and correct, so
-  // we can rely on it to verify correctness of tensor executor and tiling.
-  Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
-  golden.shuffle(shuffle) = src;
-
-  // Now do the shuffling using configured tensor executor.
-  Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
-
-  auto expr = dst.shuffle(shuffle);
-
-  using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
-  using Executor =
-      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
-
-  Executor::run(Assign(expr, src), d);
-
-  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
-    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
-  }
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
 }

 template <typename T, int NumDims, typename Device, bool Vectorizable,
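A small, self-contained illustration of the lvalue-shuffle semantics exercised above (an editor's sketch, not from the patch): assigning through dst.shuffle(...) scatters src into dst with the dimensions re-ordered, which is how the golden tensor is produced for the lvalue test.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> src(4, 6);
  src.setRandom();

  // dst has the shuffled shape; writing through dst.shuffle(...) stores src
  // transposed, so dst(j, i) == src(i, j) for this {1, 0} shuffle.
  Eigen::Tensor<float, 2> dst(6, 4);
  Eigen::array<int, 2> shuffle = {1, 0};
  dst.shuffle(shuffle) = src;

  return dst(2, 3) == src(3, 2) ? 0 : 1;
}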
@@ -723,13 +746,13 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
   CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4);
   CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 5);

-  CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
-  CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
-  CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 5);
-
-  CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 3);
-  CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 4);
-  CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 5);
+  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 5);

   CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
   CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);

@@ -741,15 +764,15 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
   CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4);
   CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5);

-  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 2);
-  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 3);
-  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 4);
-  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 5);
-
-  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 2);
-  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 3);
-  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
-  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);
+  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 5);

   CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2);
   CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3);