Add new TensorBlock api implementation + tests
commit c97b208468
parent ef9dfee7bd
unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h (new file, 960 lines added)
@@ -0,0 +1,960 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer of given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.

template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we might do a performance optimization and
  // evaluate the root expression directly into that memory, or maybe use it as
  // temporary storage for some of the subexpressions, to avoid dynamic memory
  // allocation.
  //
  // This is a type erased storage, because passing the Scalar type through all
  // the expression evaluation layers is way too many templates. Also it should
  // be possible to use this destination as a temp buffer for materializing
  // expressions with a type not matching the final output.
  class DestinationBuffer {
   public:
    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
                      const Dimensions& strides, size_t total_dst_bytes)
        : m_data(static_cast<void*>(data)),
          m_dimensions(dimensions),
          m_strides(strides),
          m_total_dst_bytes(total_dst_bytes) {
      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
      for (int i = 0; i < NumDims; ++i) {
        m_dimensions[i] *= sizeof(Scalar);
        m_strides[i] *= sizeof(Scalar);
      }
    }

    // Returns true if the tensor block corresponding to `desc` fits into the
    // contiguous block of memory defined by `*this`.
    template <typename Scalar, int Layout>
    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
      if (m_data == NULL) return false;

      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& dst_dims = dimensions<Scalar>();

      if (!dimensions_match(desc_dims, dst_dims)) return false;

      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      const Dimensions& dst_strides = internal::strides<Layout>(dst_dims);

      return dimensions_match(desc_strides, dst_strides);
    }

    template <typename Scalar>
    Dimensions dimensions() const {
      Dimensions dimensions;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
      }
      return dimensions;
    }

    template <typename Scalar>
    Dimensions strides() const {
      Dimensions strides;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
        strides[i] = m_strides[i] / sizeof(Scalar);
      }
      return strides;
    }

    void* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;

    // Total size of the memory buffer at the destination (typically the total
    // size of the left hand side of an assignment expression). This can be the
    // same as `array_prod(m_dimensions)` if the assignment target has just a
    // single block, but typically it's a larger number.
    size_t m_total_dst_bytes;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  template <typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
                            size_t total_dst_bytes) {
    m_destination =
        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    return *this;
  }

  // Returns a non-nullptr pointer to a destination buffer memory if this
  // block has a contiguous destination buffer.
  template <typename Scalar, int Layout>
  Scalar* destination() const {
    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
      return m_destination.template data<Scalar>();
    }
    return NULL;
  }

 private:
  // Offset and dimensions are immutable after construction. Block descriptor
  // can only be mutated by adding or dropping destination.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};

// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// are happening in the same order, and usually have exactly the same size.
// Scratch allocator keeps a trace of all dynamic allocations, and after the
// first block evaluation is completed, we should be able to reuse all the
// temporary buffers for the next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger or equal to
    // the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate it,
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};

// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
#if !EIGEN_HAS_CXX11
// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace.
// (Use of enumeration in a nested name specifier is a c++11 extension).
namespace TensorBlockKind {
#endif
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory. The block
  // expression is still a valid Tensor expression, and can be used to build
  // lazy expressions.
  kMaterializedInOutput

  // TODO(ezhulenev): If we know that we are evaluating a block, for the root of
  // the expression tree, it might be beneficial to do an assignment to the
  // output memory buffer, even if it will be impossible to construct a valid
  // block expression after that (e.g. output memory buffer has strides not
  // compatible with TensorMap). This might be a performance optimization for
  // uniformly shaped blocks, because for blocks skewed towards inner dimension
  // `kMaterializedInOutput` should always work.
};
#if !EIGEN_HAS_CXX11
}  // namespace TensorBlockKind
#endif

// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef in
// TensorEvaluators that do not support block evaluation.

class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from Eigen expressions (if the expression
// type is not void). It's required to be able to define a lazy block expression
// for argument types that do not support block evaluation.

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer, it's either a memory
// buffer that backs the original expression (e.g. block is just a view into a
// Tensor), or a memory buffer allocated with scratch allocator, and in this
// case the scratch allocator will deallocate it at the end of block based
// expression execution.

template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
 public:
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const DSizes<IndexType, NumDims>& dimensions)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions) {
    eigen_assert(m_kind == TensorBlockKind::kView ||
                 m_kind == TensorBlockKind::kMaterializedInScratch ||
                 m_kind == TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const { return m_expr; }
  const Scalar* data() const { return m_data; }

  void cleanup() {}

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  DSizes<IndexType, NumDims> m_dimensions;
  XprType m_expr;
};

// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression that applies a UnaryOp
// functor to the blocks produced by the underlying Tensor expression.

template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression that applies a BinaryOp
// functor to the blocks produced by the underlying Tensor expressions.

template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.

template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
        src.data);
  }

 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType unrolled_size = count - 4 * PacketSize;
    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (src_stride == 1 && dst_stride == 1) {
      // ******************************************************************** //
      // Linear copy from `src` to `dst`.
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }
      // ******************************************************************** //
    } else if (src_stride == 1 && dst_stride != 1) {
      // Scatter from `src` to `dst`.
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }
      // ******************************************************************** //
    } else if (src_stride == 0 && dst_stride == 1) {
      // Fill `dst` with value at `*src`.
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }
      // ******************************************************************** //
    } else if (src_stride == 0 && dst_stride != 1) {
      // Scatter `*src` into `dst`.
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }
      // ******************************************************************** //
    } else if (dst_stride == 1) {
      // Gather from `src` into `dst`.
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }
      // ******************************************************************** //
    } else {
      // Random.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    }
  }
};

// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. Dimensions of `dst` specify how many elements have to be copied;
// for the `src` we only need to know the strides to navigate through the
// source memory buffer.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2 {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using provided dimensions mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
      const Dst& dst, const Src& src, const Dimensions& dst_to_src_dim_map) {
    // Copy single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Both `dst` and `src` must have contiguous innermost dimension. We also
    // accept the special case with stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const Dimensions& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
    // block, and we write data linearly into that dimension, reading it from
    // the src. If dimensions are reordered, we might end up reading data from
    // the src with `stride != 1`.
    //
    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dim.
    IndexType num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const IndexType dst_stride1_dim =
        IsColMajor ? num_size_one_inner_dims
                   : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const IndexType src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

    for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) {
      // Copy data for the innermost dimension.
      LinCopy::Run(
          typename LinCopy::Dst(output_offset, output_stride, dst.data),
          typename LinCopy::Src(input_offset, input_stride, src.data),
          dst_inner_dim_size);

      // Update offsets (idx is the number of initialized block iterators).
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          input_offset += it[j].input_stride;
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        input_offset -= it[j].input_span;
        output_offset -= it[j].output_span;
      }
    }
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(const Dst& dst,
                                                         const Src& src) {
    Dimensions dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only
  // if they are not reordered.
  static int NumSqueezableInnerDims(const Dimensions& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};

// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `dst` address.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory, if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `dst->src` dimension map (see definition above).
//
// Also currently the innermost dimension of `dst` must have a stride '1'
// (contiguous in memory). This restriction could be lifted with a `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        dst[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(dst + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(dst + i, p);
      }

      for (; i < count; ++i) {
        dst[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Dst& dst, const TensorBlockExpr& expr) {
    // Prepare evaluator for block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimension should match destination dimensions.
    eigen_assert(dimensions_match(dst.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize output inner dimension size based on a layout.
    const IndexType output_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = dst.dims[inner_dim_idx];

    // Dst inner dimension stride must be '1'.
    eigen_assert(dst.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `dst`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dim];

      if (output_inner_dim_size == dst_stride) {
        output_inner_dim_size *= dst.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = dst.dims[dim];
      it[idx].output_stride = dst.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read block expression from the beginning, and start writing data to
    // `dst` at given offset.
    IndexType input_offset = 0;
    IndexType output_offset = dst.offset;

    // Iterate copying data from `eval` to `dst`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `dst` at current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(dst.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};

// -------------------------------------------------------------------------- //

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
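A minimal usage sketch of the primitives defined above (illustrative only, not part of the committed sources): it describes a 4x5 block of a dense 8x10 column-major tensor with TensorBlockDescriptor and copies it into a separate buffer with TensorBlockIOV2 using the identity dimension map. The function name, scalar type, and dimensions are arbitrary choices for the example, and it assumes the new header is reachable through <Eigen/CXX11/Tensor>, as the tests below assume.

// Illustrative sketch (hypothetical), not part of the committed sources.
#include <Eigen/CXX11/Tensor>

void tensor_block_io_sketch() {
  using Eigen::Index;
  using Eigen::Tensor;
  namespace ei = Eigen::internal;

  // Source: a densely stored 8x10 column-major tensor.
  typedef Eigen::DSizes<Index, 2> Dimensions;
  const Dimensions src_dims(8, 10);
  const Dimensions src_strides = ei::strides<Eigen::ColMajor>(src_dims);

  Tensor<float, 2, Eigen::ColMajor> src(src_dims);
  src.setRandom();

  // Describe a 4x5 block starting at linear offset 0 of the source.
  const Dimensions block_dims(4, 5);
  ei::TensorBlockDescriptor<2, Index> desc(/*offset=*/0, block_dims);

  // Destination buffer for the materialized block (also dense, column-major).
  Tensor<float, 2, Eigen::ColMajor> dst(block_dims);
  const Dimensions dst_strides = ei::strides<Eigen::ColMajor>(block_dims);

  // Copy the block with an identity dst->src dimension map. Both buffers have
  // inner stride 1, which is what TensorBlockIOV2::Copy requires.
  typedef ei::TensorBlockIOV2<float, Index, 2, Eigen::ColMajor> BlockIO;
  BlockIO::Dst io_dst(block_dims, dst_strides, dst.data());
  BlockIO::Src io_src(src_strides, src.data(), desc.offset());
  BlockIO::Copy(io_dst, io_src);
}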
unsupported/test/cxx11_tensor_block_eval.cpp (new file, 339 lines added)
@@ -0,0 +1,339 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/CXX11/Tensor>
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
using Eigen::internal::TensorBlockDescriptor;
|
||||||
|
using Eigen::internal::TensorExecutor;
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------- //
|
||||||
|
// Utility functions to generate random tensors, blocks, and evaluate them.
|
||||||
|
|
||||||
|
template <int NumDims>
|
||||||
|
static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
|
||||||
|
DSizes<Index, NumDims> dims;
|
||||||
|
for (int i = 0; i < NumDims; ++i) {
|
||||||
|
dims[i] = internal::random<Index>(min, max);
|
||||||
|
}
|
||||||
|
return DSizes<Index, NumDims>(dims);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block offsets and extents allows to construct a TensorSlicingOp corresponding
|
||||||
|
// to a TensorBlockDescriptor.
|
||||||
|
template <int NumDims>
|
||||||
|
struct TensorBlockParams {
|
||||||
|
DSizes<Index, NumDims> offsets;
|
||||||
|
DSizes<Index, NumDims> sizes;
|
||||||
|
TensorBlockDescriptor<NumDims, Index> desc;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <int Layout, int NumDims>
|
||||||
|
static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,
|
||||||
|
Index min, Index max) {
|
||||||
|
// Choose random offsets and sizes along all tensor dimensions.
|
||||||
|
DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max));
|
||||||
|
DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max));
|
||||||
|
|
||||||
|
// Make sure that offset + size do not overflow dims.
|
||||||
|
for (int i = 0; i < NumDims; ++i) {
|
||||||
|
offsets[i] = numext::mini(dims[i] - 1, offsets[i]);
|
||||||
|
sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
Index offset = 0;
|
||||||
|
DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims);
|
||||||
|
for (int i = 0; i < NumDims; ++i) {
|
||||||
|
offset += strides[i] * offsets[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate block with block sizes skewed towards inner dimensions. This type of
|
||||||
|
// block is required for evaluating broadcast expressions.
|
||||||
|
template <int Layout, int NumDims>
|
||||||
|
static TensorBlockParams<NumDims> SkewedInnerBlock(
|
||||||
|
DSizes<Index, NumDims> dims) {
|
||||||
|
using BlockMapper = internal::TensorBlockMapper<int, Index, NumDims, Layout>;
|
||||||
|
BlockMapper block_mapper(dims,
|
||||||
|
internal::TensorBlockShapeType::kSkewedInnerDims,
|
||||||
|
internal::random<Index>(1, dims.TotalSize()));
|
||||||
|
|
||||||
|
Index total_blocks = block_mapper.total_block_count();
|
||||||
|
Index block_index = internal::random<Index>(0, total_blocks - 1);
|
||||||
|
auto block = block_mapper.GetBlockForIndex(block_index, nullptr);
|
||||||
|
DSizes<Index, NumDims> sizes = block.block_sizes();
|
||||||
|
|
||||||
|
auto strides = internal::strides<Layout>(dims);
|
||||||
|
DSizes<Index, NumDims> offsets;
|
||||||
|
|
||||||
|
// Compute offsets for the first block coefficient.
|
||||||
|
Index index = block.first_coeff_index();
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
for (int i = NumDims - 1; i > 0; --i) {
|
||||||
|
const Index idx = index / strides[i];
|
||||||
|
index -= idx * strides[i];
|
||||||
|
offsets[i] = idx;
|
||||||
|
}
|
||||||
|
offsets[0] = index;
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < NumDims - 1; ++i) {
|
||||||
|
const Index idx = index / strides[i];
|
||||||
|
index -= idx * strides[i];
|
||||||
|
offsets[i] = idx;
|
||||||
|
}
|
||||||
|
offsets[NumDims - 1] = index;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto desc = TensorBlockDescriptor<NumDims>(block.first_coeff_index(), sizes);
|
||||||
|
return {offsets, sizes, desc};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int NumDims>
|
||||||
|
static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) {
|
||||||
|
DSizes<Index, NumDims> offsets;
|
||||||
|
for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
|
||||||
|
|
||||||
|
return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)};
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------- //
|
||||||
|
// Verify that block expression evaluation produces the same result as a
|
||||||
|
// TensorSliceOp (reading a tensor block is same to taking a tensor slice).
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout, typename Expression,
|
||||||
|
typename GenBlockParams>
|
||||||
|
static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
|
||||||
|
using Device = DefaultDevice;
|
||||||
|
auto d = Device();
|
||||||
|
|
||||||
|
// Scratch memory allocator for block evaluation.
|
||||||
|
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||||
|
TensorBlockScratch scratch(d);
|
||||||
|
|
||||||
|
// TensorEvaluator is needed to produce tensor blocks of the expression.
|
||||||
|
auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d);
|
||||||
|
|
||||||
|
// Choose a random offsets, sizes and TensorBlockDescriptor.
|
||||||
|
TensorBlockParams<NumDims> block_params = gen_block();
|
||||||
|
|
||||||
|
// Evaluate TensorBlock expression into a tensor.
|
||||||
|
Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
|
||||||
|
|
||||||
|
// Maybe use this tensor as a block desc destination.
|
||||||
|
Tensor<T, NumDims, Layout> dst(block_params.desc.dimensions());
|
||||||
|
if (internal::random<bool>()) {
|
||||||
|
block_params.desc.template AddDestinationBuffer(
|
||||||
|
dst.data(), internal::strides<Layout>(dst.dimensions()),
|
||||||
|
dst.dimensions().TotalSize() * sizeof(T));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto tensor_block = eval.blockV2(block_params.desc, scratch);
|
||||||
|
auto b_expr = tensor_block.expr();
|
||||||
|
|
||||||
|
// We explicitly disable vectorization and tiling, to run a simple coefficient
|
||||||
|
// wise assignment loop, because it's very simple and should be correct.
|
||||||
|
using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
|
||||||
|
using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
|
||||||
|
internal::TiledEvaluation::Off>;
|
||||||
|
BlockExecutor::run(BlockAssign(block, b_expr), d);
|
||||||
|
|
||||||
|
// Cleanup temporary buffers owned by a tensor block.
|
||||||
|
tensor_block.cleanup();
|
||||||
|
|
||||||
|
// Compute a Tensor slice corresponding to a Tensor block.
|
||||||
|
Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions());
|
||||||
|
auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
|
||||||
|
|
||||||
|
// Explicitly use coefficient assignment to evaluate slice expression.
|
||||||
|
using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>;
|
||||||
|
using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
|
||||||
|
internal::TiledEvaluation::Off>;
|
||||||
|
SliceExecutor::run(SliceAssign(slice, s_expr), d);
|
||||||
|
|
||||||
|
// Tensor block and tensor slice must be the same.
|
||||||
|
for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
|
||||||
|
VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------- //
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_eval_tensor_block() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
|
||||||
|
Tensor<T, NumDims, Layout> input(dims);
|
||||||
|
input.setRandom();
|
||||||
|
|
||||||
|
// Identity tensor expression transformation.
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
input, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_eval_tensor_unary_expr_block() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
|
||||||
|
Tensor<T, NumDims, Layout> input(dims);
|
||||||
|
input.setRandom();
|
||||||
|
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
input.square(), [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_eval_tensor_binary_expr_block() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
|
||||||
|
Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
|
||||||
|
lhs.setRandom();
|
||||||
|
rhs.setRandom();
|
||||||
|
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
lhs + rhs, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_eval_tensor_binary_with_unary_expr_block() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
|
||||||
|
Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
|
||||||
|
lhs.setRandom();
|
||||||
|
rhs.setRandom();
|
||||||
|
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
(lhs.square() + rhs.square()).sqrt(),
|
||||||
|
[&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_eval_tensor_broadcast() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
|
||||||
|
Tensor<T, NumDims, Layout> input(dims);
|
||||||
|
input.setRandom();
|
||||||
|
|
||||||
|
DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5);
|
||||||
|
|
||||||
|
DSizes<Index, NumDims> bcasted_dims;
|
||||||
|
for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i];
|
||||||
|
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
input.broadcast(bcast),
|
||||||
|
[&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
|
||||||
|
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
input.broadcast(bcast),
|
||||||
|
[&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); });
|
||||||
|
|
||||||
|
// Check that desc.destination() memory is not shared between two broadcast
|
||||||
|
// materializations.
|
||||||
|
VerifyBlockEvaluator<T, NumDims, Layout>(
|
||||||
|
input.broadcast(bcast) + input.square().broadcast(bcast),
|
||||||
|
[&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------- //
|
||||||
|
// Verify that assigning block to a Tensor expression produces the same result
|
||||||
|
// as an assignment to TensorSliceOp (writing a block is is identical to
|
||||||
|
// assigning one tensor to a slice of another tensor).
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout, typename Expression,
|
||||||
|
typename GenBlockParams>
|
||||||
|
static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
|
||||||
|
Expression expr, GenBlockParams gen_block) {
|
||||||
|
using Device = DefaultDevice;
|
||||||
|
auto d = Device();
|
||||||
|
|
||||||
|
// We use tensor evaluator as a target for block and slice assignments.
|
||||||
|
auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
|
||||||
|
|
||||||
|
// Generate a random block, or choose a block that fits in full expression.
|
||||||
|
TensorBlockParams<NumDims> block_params = gen_block();
|
||||||
|
|
||||||
|
// Generate random data of the selected block size.
|
||||||
|
Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
|
||||||
|
block.setRandom();
|
||||||
|
|
||||||
|
// ************************************************************************ //
|
||||||
|
// (1) Assignment from a block.
|
||||||
|
|
||||||
|
// Construct a materialize block from a random generated block tensor.
|
||||||
|
internal::TensorMaterializedBlock<T, NumDims, Layout> blk(
|
||||||
|
internal::TensorBlockKind::kView, block.data(), block.dimensions());
|
||||||
|
|
||||||
|
// Reset all underlying tensor values to zero.
|
||||||
|
tensor.setZero();
|
||||||
|
|
||||||
|
// Use evaluator to write block into a tensor.
|
||||||
|
eval.writeBlockV2(block_params.desc, blk);
|
||||||
|
|
||||||
|
// Make a copy of the result after assignment.
|
||||||
|
Tensor<T, NumDims, Layout> block_assigned = tensor;
|
||||||
|
|
||||||
|
// ************************************************************************ //
|
||||||
|
// (2) Assignment to a slice
|
||||||
|
|
||||||
|
// Reset all underlying tensor values to zero.
|
||||||
|
tensor.setZero();
|
||||||
|
|
||||||
|
// Assign block to a slice of original expression
|
||||||
|
auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
|
||||||
|
|
||||||
|
// Explicitly use coefficient assignment to evaluate slice expression.
|
||||||
|
using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
|
||||||
|
using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
|
||||||
|
internal::TiledEvaluation::Off>;
|
||||||
|
SliceExecutor::run(SliceAssign(s_expr, block), d);
|
||||||
|
|
||||||
|
// Make a copy of the result after assignment.
|
||||||
|
Tensor<T, NumDims, Layout> slice_assigned = tensor;
|
||||||
|
|
||||||
|
for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) {
|
||||||
|
VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i));
|
||||||
|
}
|
||||||
|
}

// -------------------------------------------------------------------------- //

template <typename T, int NumDims, int Layout>
static void test_assign_tensor_block() {
  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
  Tensor<T, NumDims, Layout> tensor(dims);

  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);

  VerifyBlockAssignment<T, NumDims, Layout>(
      tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
  VerifyBlockAssignment<T, NumDims, Layout>(
      tensor, map, [&dims]() { return FixedSizeBlock(dims); });
}

// -------------------------------------------------------------------------- //

//#define CALL_SUBTESTS(NAME) CALL_SUBTEST((NAME<float, 2, RowMajor>()))

#define CALL_SUBTESTS(NAME) \
  CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
  CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
  CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
  CALL_SUBTEST((NAME<float, 5, ColMajor>()))

EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
  // clang-format off
  CALL_SUBTESTS(test_eval_tensor_block);
  CALL_SUBTESTS(test_eval_tensor_unary_expr_block);
  CALL_SUBTESTS(test_eval_tensor_binary_expr_block);
  CALL_SUBTESTS(test_eval_tensor_binary_with_unary_expr_block);
  CALL_SUBTESTS(test_eval_tensor_broadcast);

  CALL_SUBTESTS(test_assign_tensor_block);
  // clang-format on
}
438
unsupported/test/cxx11_tensor_block_io.cpp
Normal file
@ -0,0 +1,438 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// clang-format off
#include "main.h"
#include <Eigen/CXX11/Tensor>
// clang-format on

// -------------------------------------------------------------------------- //
// A set of tests for TensorBlockIO: copying data between tensor blocks.

template <int NumDims>
static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
  DSizes<Index, NumDims> dims;
  for (int i = 0; i < NumDims; ++i) {
    dims[i] = internal::random<Index>(min, max);
  }
  return DSizes<Index, NumDims>(dims);
}

static internal::TensorBlockShapeType RandomBlockShape() {
  return internal::random<bool>() ? internal::kUniformAllDims
                                  : internal::kSkewedInnerDims;
}
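
// NOTE: kUniformAllDims and kSkewedInnerDims are the two block shape
// strategies: roughly, the former asks the block mapper for blocks with
// similar extents in every dimension, while the latter favors blocks that are
// as large as possible along the innermost dimensions in storage order.
// Picking one at random exercises both partitioning strategies.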

template <int NumDims>
static Index RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) {
  return internal::random<Index>(1, dims.TotalSize());
}

// Maps a linear index into the shuffled output tensor to the linear index of
// the same coefficient in the original input tensor.
template <int Layout, int NumDims>
static Index GetInputIndex(Index output_index,
                           const array<Index, NumDims>& output_to_input_dim_map,
                           const array<Index, NumDims>& input_strides,
                           const array<Index, NumDims>& output_strides) {
  Index input_index = 0;
  if (Layout == ColMajor) {
    for (int i = NumDims - 1; i > 0; --i) {
      const Index idx = output_index / output_strides[i];
      input_index += idx * input_strides[output_to_input_dim_map[i]];
      output_index -= idx * output_strides[i];
    }
    return input_index +
           output_index * input_strides[output_to_input_dim_map[0]];
  } else {
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index idx = output_index / output_strides[i];
      input_index += idx * input_strides[output_to_input_dim_map[i]];
      output_index -= idx * output_strides[i];
    }
    return input_index +
           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
  }
}
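
// Worked example (illustrative values, not used by the tests): for a ColMajor
// input of dims (3, 4) shuffled into an output of dims (4, 3) we get
//   input_strides  = [1, 3],  output_strides = [1, 4],
//   output_to_input_dim_map = {1, 0}.
// The output coefficient at coordinate (a, b) has output_index = a + 4 * b and
// comes from input coordinate (b, a), i.e. input_index = b + 3 * a, which is
// what the stride decomposition above computes.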

template <typename T, int NumDims, int Layout>
static void test_block_io_copy_data_from_source_to_target() {
  using TensorBlockIO = internal::TensorBlockIOV2<T, Index, NumDims, Layout>;
  using IODst = typename TensorBlockIO::Dst;
  using IOSrc = typename TensorBlockIO::Src;

  // Generate a random input Tensor.
  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
  Tensor<T, NumDims, Layout> input(dims);
  input.setRandom();

  // Write data to an output Tensor.
  Tensor<T, NumDims, Layout> output(dims);

  // Construct a tensor block mapper.
  using TensorBlockMapper =
      internal::TensorBlockMapper<T, Index, NumDims, Layout>;
  TensorBlockMapper block_mapper(dims, RandomBlockShape(),
                                 RandomTargetBlockSize(dims));

  // We will copy data from input to output through this buffer.
  Tensor<T, NumDims, Layout> block(block_mapper.block_dim_sizes());

  // Precompute strides for TensorBlockIO::Copy.
  auto input_strides = internal::strides<Layout>(dims);
  auto output_strides = internal::strides<Layout>(dims);

  const T* input_data = input.data();
  T* output_data = output.data();
  T* block_data = block.data();

  for (int i = 0; i < block_mapper.total_block_count(); ++i) {
    using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
    TensorBlock blk = block_mapper.GetBlockForIndex(i, block_data);

    auto blk_dims = blk.block_sizes();
    auto blk_strides = internal::strides<Layout>(blk_dims);

    {
      // Read from input into a block buffer.
      IODst dst(blk_dims, blk_strides, block_data, 0);
      IOSrc src(input_strides, input_data, blk.first_coeff_index());

      TensorBlockIO::Copy(dst, src);
    }

    {
      // Write from block buffer to output.
      IODst dst(blk_dims, output_strides, output_data, blk.first_coeff_index());
      IOSrc src(blk_strides, block_data, 0);

      TensorBlockIO::Copy(dst, src);
    }
  }

  for (int i = 0; i < dims.TotalSize(); ++i) {
    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
  }
}

template <typename T, int NumDims, int Layout>
static void test_block_io_copy_using_reordered_dimensions() {
  // Generate a random input Tensor.
  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
  Tensor<T, NumDims, Layout> input(dims);
  input.setRandom();

  // Create a random dimension re-ordering/shuffle.
  std::vector<int> shuffle;

  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));

  DSizes<Index, NumDims> output_tensor_dims;
  DSizes<Index, NumDims> input_to_output_dim_map;
  DSizes<Index, NumDims> output_to_input_dim_map;
  for (Index i = 0; i < NumDims; ++i) {
    output_tensor_dims[shuffle[i]] = dims[i];
    input_to_output_dim_map[i] = shuffle[i];
    output_to_input_dim_map[shuffle[i]] = i;
  }
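
  // For example (hypothetical values): with dims (2, 3, 4) and
  // shuffle = {2, 0, 1}, input dimension i is moved to output dimension
  // shuffle[i], so
  //   output_tensor_dims      = (3, 4, 2),
  //   input_to_output_dim_map = {2, 0, 1},
  //   output_to_input_dim_map = {1, 2, 0}.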

  // Write data to an output Tensor.
  Tensor<T, NumDims, Layout> output(output_tensor_dims);

  // Construct a tensor block mapper.
  // NOTE: Tensor block mapper works with shuffled dimensions.
  using TensorBlockMapper =
      internal::TensorBlockMapper<T, Index, NumDims, Layout>;
  TensorBlockMapper block_mapper(output_tensor_dims, RandomBlockShape(),
                                 RandomTargetBlockSize(output_tensor_dims));

  // We will copy data from input to output through this buffer.
  Tensor<T, NumDims, Layout> block(block_mapper.block_dim_sizes());

  // Precompute strides for TensorBlockIO::Copy.
  auto input_strides = internal::strides<Layout>(dims);
  auto output_strides = internal::strides<Layout>(output_tensor_dims);

  const T* input_data = input.data();
  T* output_data = output.data();
  T* block_data = block.data();

  for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
    using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
    TensorBlock blk = block_mapper.GetBlockForIndex(i, block_data);

    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
        blk.first_coeff_index(), output_to_input_dim_map, input_strides,
        output_strides);

    // NOTE: Block dimensions are in the same order as output dimensions.

    using TensorBlockIO = internal::TensorBlockIOV2<T, Index, NumDims, Layout>;
    using IODst = typename TensorBlockIO::Dst;
    using IOSrc = typename TensorBlockIO::Src;

    auto blk_dims = blk.block_sizes();
    auto blk_strides = internal::strides<Layout>(blk_dims);

    {
      // Read from input into a block buffer.
      IODst dst(blk_dims, blk_strides, block_data, 0);
      IOSrc src(input_strides, input_data, first_coeff_index);

      TensorBlockIO::Copy(dst, src,
                          /*dst_to_src_dim_map=*/output_to_input_dim_map);
    }

    {
      // We need to convert block dimensions from output to input order.
      auto dst_dims = blk_dims;
      for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
        dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
      }

      // Write from block buffer to output.
      IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
      IOSrc src(blk_strides, block_data, 0);

      TensorBlockIO::Copy(dst, src,
                          /*dst_to_src_dim_map=*/input_to_output_dim_map);
    }
  }

  for (Index i = 0; i < dims.TotalSize(); ++i) {
    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
  }
}

// This is the special case for reading data with reordering, when dimensions
// before/after reordering are the same. Squeezing reads along inner dimensions
// in this case is illegal, because we reorder the innermost dimension.
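//
// For example, with tensor dims (7, 9, 7) and dim map (2, 1, 0), the block's
// innermost dimension (in storage order) maps to the tensor's outermost one,
// so consecutive block coefficients come from tensor elements that are a full
// outer stride (7 * 9 = 63 elements) apart; merging ("squeezing") the inner
// dimensions into a single contiguous copy would read the wrong elements.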
template <int Layout>
static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
  DSizes<Index, 3> tensor_dims(7, 9, 7);
  DSizes<Index, 3> block_dims = tensor_dims;

  DSizes<Index, 3> block_to_tensor_dim;
  block_to_tensor_dim[0] = 2;
  block_to_tensor_dim[1] = 1;
  block_to_tensor_dim[2] = 0;

  auto tensor_strides = internal::strides<Layout>(tensor_dims);
  auto block_strides = internal::strides<Layout>(block_dims);

  Tensor<float, 3, Layout> block(block_dims);
  Tensor<float, 3, Layout> tensor(tensor_dims);
  tensor.setRandom();

  float* tensor_data = tensor.data();
  float* block_data = block.data();

  typedef internal::TensorBlock<float, Index, 3, Layout> TensorBlock;
  TensorBlock blk(0, block_dims, block_strides, tensor_strides, block_data);

  using TensorBlockIO = internal::TensorBlockIOV2<float, Index, 3, Layout>;
  using IODst = typename TensorBlockIO::Dst;
  using IOSrc = typename TensorBlockIO::Src;

  // Read from a tensor into a block.
  IODst dst(blk.block_sizes(), block_strides, block_data, 0);
  IOSrc src(tensor_strides, tensor_data, blk.first_coeff_index());

  TensorBlockIO::Copy(dst, src,
                      /*dst_to_src_dim_map=*/block_to_tensor_dim);

  TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
  TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);

  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
        float block_value = block_tensor(d2, d1, d0);
        float tensor_value = tensor_tensor(d0, d1, d2);
        VERIFY_IS_EQUAL(block_value, tensor_value);
      }
    }
  }
}

// This is the special case for reading data with reordering, when dimensions
// before/after reordering are the same. Squeezing reads in this case is allowed
// because we reorder outer dimensions.
template <int Layout>
static void test_block_io_copy_using_reordered_dimensions_squeeze() {
  DSizes<Index, 4> tensor_dims(7, 5, 9, 9);
  DSizes<Index, 4> block_dims = tensor_dims;

  DSizes<Index, 4> block_to_tensor_dim;
  block_to_tensor_dim[0] = 0;
  block_to_tensor_dim[1] = 1;
  block_to_tensor_dim[2] = 3;
  block_to_tensor_dim[3] = 2;

  auto tensor_strides = internal::strides<Layout>(tensor_dims);
  auto block_strides = internal::strides<Layout>(block_dims);

  Tensor<float, 4, Layout> block(block_dims);
  Tensor<float, 4, Layout> tensor(tensor_dims);
  tensor.setRandom();

  float* tensor_data = tensor.data();
  float* block_data = block.data();

  typedef internal::TensorBlock<float, Index, 4, Layout> TensorBlock;
  TensorBlock blk(0, block_dims, block_strides, tensor_strides, block_data);

  using TensorBlockIO = internal::TensorBlockIOV2<float, Index, 4, Layout>;
  using IODst = typename TensorBlockIO::Dst;
  using IOSrc = typename TensorBlockIO::Src;

  // Read from a tensor into a block.
  IODst dst(blk.block_sizes(), block_strides, block_data, 0);
  IOSrc src(tensor_strides, tensor_data, blk.first_coeff_index());

  TensorBlockIO::Copy(dst, src,
                      /*dst_to_src_dim_map=*/block_to_tensor_dim);

  TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
  TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);

  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
        for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
          float block_value = block_tensor(d0, d1, d3, d2);
          float tensor_value = tensor_tensor(d0, d1, d2, d3);
          VERIFY_IS_EQUAL(block_value, tensor_value);
        }
      }
    }
  }
}

template <int Layout>
static void test_block_io_zero_stride() {
  DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30);

  DSizes<Index, 5> input_tensor_dims = rnd_dims;
  input_tensor_dims[0] = 1;
  input_tensor_dims[2] = 1;
  input_tensor_dims[4] = 1;

  Tensor<float, 5, Layout> input(input_tensor_dims);
  input.setRandom();

  DSizes<Index, 5> output_tensor_dims = rnd_dims;

  auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims);
  auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims);

  auto input_tensor_strides_with_zeros = input_tensor_strides;
  input_tensor_strides_with_zeros[0] = 0;
  input_tensor_strides_with_zeros[2] = 0;
  input_tensor_strides_with_zeros[4] = 0;

  Tensor<float, 5, Layout> output(output_tensor_dims);
  output.setRandom();

  using TensorBlockIO = internal::TensorBlockIOV2<float, Index, 5, Layout>;
  using IODst = typename TensorBlockIO::Dst;
  using IOSrc = typename TensorBlockIO::Src;

  // Write data from input to output with broadcasting in dims [0, 2, 4].
  IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0);
  IOSrc src(input_tensor_strides_with_zeros, input.data(), 0);
  TensorBlockIO::Copy(dst, src);
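
  // With the zeroed strides above, advancing the output coordinate along dims
  // 0, 2 and 4 does not advance the input index, so every output coefficient
  // output(i, j, k, l, m) is read from input(0, j, 0, l, 0), which is exactly
  // what the nested loops below verify.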

  for (int i = 0; i < output_tensor_dims[0]; ++i) {
    for (int j = 0; j < output_tensor_dims[1]; ++j) {
      for (int k = 0; k < output_tensor_dims[2]; ++k) {
        for (int l = 0; l < output_tensor_dims[3]; ++l) {
          for (int m = 0; m < output_tensor_dims[4]; ++m) {
            float input_value = input(0, j, 0, l, 0);
            float output_value = output(i, j, k, l, m);
            VERIFY_IS_EQUAL(input_value, output_value);
          }
        }
      }
    }
  }
}

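// Block IO should also handle blocks where some (or all) dimensions are of
// size one: such dimensions add nothing to the iteration space, and the copy
// below is expected to simply pass the data through unchanged.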
template <int Layout>
static void test_block_io_squeeze_ones() {
  using TensorBlockIO = internal::TensorBlockIOV2<float, Index, 5, Layout>;
  using IODst = typename TensorBlockIO::Dst;
  using IOSrc = typename TensorBlockIO::Src;

  // Total size > 1.
  {
    DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
    auto strides = internal::strides<Layout>(block_sizes);

    // Create a random input tensor.
    Tensor<float, 5> input(block_sizes);
    input.setRandom();

    Tensor<float, 5> output(block_sizes);

    IODst dst(block_sizes, strides, output.data(), 0);
    IOSrc src(strides, input.data());
    TensorBlockIO::Copy(dst, src);

    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
    }
  }

  // Total size == 1.
  {
    DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
    auto strides = internal::strides<Layout>(block_sizes);

    // Create a random input tensor.
    Tensor<float, 5> input(block_sizes);
    input.setRandom();

    Tensor<float, 5> output(block_sizes);

    IODst dst(block_sizes, strides, output.data(), 0);
    IOSrc src(strides, input.data());
    TensorBlockIO::Copy(dst, src);

    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
    }
  }
}

#define CALL_SUBTESTS(NAME) \
  CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
  CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
  CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
  CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
  CALL_SUBTEST((NAME<float, 5, ColMajor>()))

EIGEN_DECLARE_TEST(cxx11_tensor_block_io) {
  // clang-format off
  CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target);
  CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions);

  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>());
  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>());

  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>());
  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>());

  CALL_SUBTEST(test_block_io_zero_stride<RowMajor>());
  CALL_SUBTEST(test_block_io_zero_stride<ColMajor>());

  CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>());
  CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>());
  // clang-format on
}