Mirror of https://gitlab.com/libeigen/eigen.git
commit 83c0a16baf (parent d6568425f8)

    Add block evaluation support to TensorOps
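For orientation, the snippet below is illustrative only and is not part of the commit: it shows the kind of user-level Tensor expressions whose evaluators are extended in the hunks that follow with block-based evaluation (a BlockAccess flag, a getResourceRequirements() hint, and a block() / writeBlock() implementation). The image-patch evaluator (extract_image_patches) is extended in the same way.

// Illustrative usage sketch of the ops touched by this commit; not code from the patch.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> in(4, 5);
  in.setRandom();

  Eigen::array<Eigen::Index, 2> bcast{{3, 2}};
  Eigen::Tensor<float, 2> broadcasted = in.broadcast(bcast);    // TensorBroadcastingOp

  Eigen::Tensor<float, 1> chipped = in.chip(1, 0);              // TensorChippingOp

  Eigen::array<Eigen::Index, 2> new_shape{{5, 4}};
  Eigen::Tensor<float, 2> reshaped = in.reshape(new_shape);     // TensorReshapingOp

  Eigen::array<Eigen::Index, 2> offsets{{1, 2}};
  Eigen::array<Eigen::Index, 2> extents{{2, 3}};
  Eigen::Tensor<float, 2> sliced = in.slice(offsets, extents);  // TensorSlicingOp
  return 0;
}
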
@@ -214,7 +214,7 @@ class TensorBlockIO {
         num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
     const StorageIndex block_dim_for_tensor_stride1_dim =
         NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
-    size_t block_inner_dim_size =
+    Index block_inner_dim_size =
         NumDims == 0 ? 1
                      : block.block_sizes()[block_dim_for_tensor_stride1_dim];
     for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {

@@ -745,16 +745,15 @@ class TensorBlockMapper {
     if (block_shape == TensorBlockShapeType::kUniformAllDims) {
       // Tensor will not fit within 'min_target_size' budget: calculate tensor
       // block dimension sizes based on "square" dimension size target.
-      const size_t dim_size_target = static_cast<const size_t>(
+      const Index dim_size_target = static_cast<Index>(
           std::pow(static_cast<float>(min_target_size),
                    1.0 / static_cast<float>(block_dim_sizes.rank())));
-      for (size_t i = 0; i < block_dim_sizes.rank(); ++i) {
+      for (Index i = 0; i < block_dim_sizes.rank(); ++i) {
        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
        // a multiple of the packet size. Note that reducing
        // 'block_dim_size' in this manner can increase the number of
        // blocks, and so will amplify any per-block overhead.
-        block_dim_sizes[i] = numext::mini(
-            dim_size_target, static_cast<size_t>(tensor_dims[i]));
+        block_dim_sizes[i] = numext::mini(dim_size_target, tensor_dims[i]);
       }
       // Add any un-allocated coefficients to inner dimension(s).
       StorageIndex total_size = block_dim_sizes.TotalSize();

@@ -789,9 +788,8 @@ class TensorBlockMapper {
       }
     }

-    eigen_assert(
-        block_dim_sizes.TotalSize() >=
-        numext::mini<size_t>(min_target_size, tensor_dims.TotalSize()));
+    eigen_assert(block_dim_sizes.TotalSize() >=
+                 numext::mini<Index>(min_target_size, tensor_dims.TotalSize()));

     return block_dim_sizes;
   }
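A quick worked example of the kUniformAllDims sizing above (numbers invented for illustration, not taken from the patch): with a budget of min_target_size = 4096 coefficients and a rank-2 tensor, dim_size_target = pow(4096, 1/2) = 64, so the mapper aims for 64 x 64 blocks; each per-dimension size is then clamped to the real tensor dimension by numext::mini, and, per the comment above, any budget left un-allocated is folded back into the inner dimension(s).
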
@@ -108,16 +108,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   bool isCopy= false, nByOne = false, oneByN = false;

   enum {
     IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_broadcast(op.broadcast()),m_impl(op.expression(), device)
-  {
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  // Block based access to the XprType (input) tensor.
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+  using TensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>;
+  // We do block based broadcasting using a a trick with 2x tensor rank and 0
+  // strides. See block method implementation for details.
+  using BroadcastDimensions = DSizes<Index, 2 * NumDims>;
+  using BroadcastTensorBlock = internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>;
+  using BroadcastTensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_device(device),
+        m_broadcast(op.broadcast()),
+        m_impl(op.expression(), device) {
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
     // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.

@@ -216,8 +229,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   }

   // TODO: attempt to speed this up. The integer divisions and modulo are slow
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];

@@ -243,11 +255,15 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         inputIndex += (index % m_impl.dimensions()[0]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
   }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
   {
+    return m_impl.coeff(indexColMajor(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
     Index inputIndex = 0;
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];

@@ -263,17 +279,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
       index -= idx * m_outputStrides[i];
     }
-    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
-      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
       inputIndex += index;
     } else {
-      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
-        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
       } else {
-        inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  {
+    return m_impl.coeff(indexRowMajor(index));
   }

   template<int LoadMode>

@@ -553,13 +574,291 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+    // tensors. But this might need further tuning.
+    Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar);
+    Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars);
+
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        block_total_size_max));
+
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    if (NumDims <= 0) {
+      output_block->data()[0] = m_impl.coeff(0);
+      return;
+    }
+
+    // Because we only support kSkewedInnerDims blocking, block size should be
+    // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size
+    // for the first outer dim, and 1 for other outer dims. This is guaranteed
+    // by MergeResourceRequirements() in TensorBlock.h.
+    const auto& output_block_sizes = output_block->block_sizes();
+    const auto& output_block_strides = output_block->block_strides();
+
+    // Find where outer dims start.
+    int outer_dim_start = 0;
+    Index outer_dim_size = 1, inner_dim_size = 1;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : NumDims - i - 1;
+      if (i > outer_dim_start) {
+        eigen_assert(output_block_sizes[dim] == 1);
+      } else if (output_block_sizes[dim] != m_dimensions[dim]) {
+        eigen_assert(output_block_sizes[dim] < m_dimensions[dim]);
+        outer_dim_size = output_block_sizes[dim];
+      } else {
+        inner_dim_size *= output_block_sizes[dim];
+        ++outer_dim_start;
+      }
+    }
+
+    if (inner_dim_size == 0 || outer_dim_size == 0) {
+      return;
+    }
+
+    const auto& input_dims = m_impl.dimensions();
+
+    // Pre-fill input_block_sizes, broadcast_block_sizes,
+    // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
+    // only modify the outer_dim_start-th dimension on these arrays.
+
+    // Calculate the input block size for looking into the input.
+    Dimensions input_block_sizes;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < outer_dim_start; ++i) {
+        input_block_sizes[i] = input_dims[i];
+      }
+      for (int i = outer_dim_start; i < NumDims; ++i) {
+        input_block_sizes[i] = 1;
+      }
+    } else {
+      for (int i = 0; i < outer_dim_start; ++i) {
+        input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1];
+      }
+      for (int i = outer_dim_start; i < NumDims; ++i) {
+        input_block_sizes[NumDims - i - 1] = 1;
+      }
+    }
+
+    // Broadcast with the 0-stride trick: Create 1 extra dim for each
+    // broadcast, set the input stride to 0.
+    //
+    // When ColMajor:
+    // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...].
+    //
+    // - broadcast_block_strides is [output_block_strides[0],
+    //                               output_block_strides[0] * d_0,
+    //                               output_block_strides[1],
+    //                               output_block_strides[1] * d_1,
+    //                               ...].
+    //
+    // - broadcast_tensor_strides is [output_block_strides[0],
+    //                                0,
+    //                                output_block_strides[1],
+    //                                0,
+    //                                ...].
+    BroadcastDimensions broadcast_block_sizes, broadcast_block_strides,
+        broadcast_tensor_strides;
+
+    for (int i = 0; i < outer_dim_start; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : NumDims - i - 1;
+      const int copy_dim =
+          static_cast<int>(Layout) == static_cast<int>(ColMajor)
+              ? 2 * i
+              : 2 * NumDims - 2 * i - 1;
+      const int broadcast_dim =
+          static_cast<int>(Layout) == static_cast<int>(ColMajor) ? copy_dim + 1
+                                                                 : copy_dim - 1;
+      broadcast_block_sizes[copy_dim] = input_dims[dim];
+      broadcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+      broadcast_block_strides[copy_dim] = output_block_strides[dim];
+      broadcast_block_strides[broadcast_dim] =
+          output_block_strides[dim] * input_dims[dim];
+      broadcast_tensor_strides[copy_dim] = m_inputStrides[dim];
+      broadcast_tensor_strides[broadcast_dim] = 0;
+    }
+    for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : 2 * NumDims - i - 1;
+      broadcast_block_sizes[dim] = 1;
+      broadcast_block_strides[dim] = 0;
+      broadcast_tensor_strides[dim] = 0;
+    }
+
+    const int outer_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? outer_dim_start
+                              : NumDims - outer_dim_start - 1;
+
+    if (outer_dim_size == 1) {
+      // We just need one block read using the ready-set values above.
+      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                     broadcast_block_strides, broadcast_tensor_strides, 0,
+                     output_block);
+    } else if (input_dims[outer_dim] == 1) {
+      // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
+      const int broadcast_outer_dim =
+          static_cast<int>(Layout) == static_cast<int>(ColMajor)
+              ? 2 * outer_dim_start + 1
+              : 2 * NumDims - 2 * outer_dim_start - 2;
+      broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
+      broadcast_tensor_strides[broadcast_outer_dim] = 0;
+      broadcast_block_strides[broadcast_outer_dim] =
+          output_block_strides[outer_dim];
+      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                     broadcast_block_strides, broadcast_tensor_strides, 0,
+                     output_block);
+    } else {
+      // The general case. Let's denote the output block as x[...,
+      // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
+      // the outer_dim_start-th dimension (< NumDims). We need to split the
+      // a:a+outer_dim_size into possibly 3 sub-blocks:
+      //
+      // (1) a:b, where b is the smallest multiple of
+      //     input_dims[outer_dim_start] in [a, a+outer_dim_size].
+      //
+      // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
+      //     in [a, a+outer_dim_size].
+      //
+      // (3) c:a+outer_dim_size .
+      //
+      // Or, when b and c do not exist, we just need to process the whole block
+      // together.
+
+      // Find a.
+      const Index outer_dim_left_index =
+          output_block->first_coeff_index() / m_outputStrides[outer_dim];
+
+      // Find b and c.
+      const Index input_outer_dim_size = input_dims[outer_dim];
+
+      // First multiple after a. This is b when <= outer_dim_left_index +
+      // outer_dim_size.
+      const Index first_multiple =
+          divup<Index>(outer_dim_left_index, input_outer_dim_size) *
+          input_outer_dim_size;
+
+      if (first_multiple <= outer_dim_left_index + outer_dim_size) {
+        // b exists, so does c. Find it.
+        const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
+                                    input_outer_dim_size * input_outer_dim_size;
+        const int copy_outer_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 2 * outer_dim_start
+                : 2 * NumDims - 2 * outer_dim_start - 1;
+        const int broadcast_outer_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 2 * outer_dim_start + 1
+                : 2 * NumDims - 2 * outer_dim_start - 2;
+        if (first_multiple > outer_dim_left_index) {
+          const Index head_size = first_multiple - outer_dim_left_index;
+          input_block_sizes[outer_dim] = head_size;
+          broadcast_block_sizes[copy_outer_dim] = head_size;
+          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+          broadcast_block_strides[copy_outer_dim] =
+              output_block_strides[outer_dim];
+          broadcast_block_sizes[broadcast_outer_dim] = 1;
+          broadcast_tensor_strides[broadcast_outer_dim] = 0;
+          broadcast_block_strides[broadcast_outer_dim] =
+              output_block_strides[outer_dim] * input_dims[outer_dim];
+          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                         broadcast_block_strides, broadcast_tensor_strides, 0,
+                         output_block);
+        }
+        if (first_multiple < last_multiple) {
+          input_block_sizes[outer_dim] = input_outer_dim_size;
+          broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
+          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+          broadcast_block_strides[copy_outer_dim] =
+              output_block_strides[outer_dim];
+          broadcast_block_sizes[broadcast_outer_dim] =
+              (last_multiple - first_multiple) / input_outer_dim_size;
+          broadcast_tensor_strides[broadcast_outer_dim] = 0;
+          broadcast_block_strides[broadcast_outer_dim] =
+              output_block_strides[outer_dim] * input_dims[outer_dim];
+          const Index offset = (first_multiple - outer_dim_left_index) *
+                               m_outputStrides[outer_dim];
+          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                         broadcast_block_strides, broadcast_tensor_strides,
+                         offset, output_block);
+        }
+        if (last_multiple < outer_dim_left_index + outer_dim_size) {
+          const Index tail_size =
+              outer_dim_left_index + outer_dim_size - last_multiple;
+          input_block_sizes[outer_dim] = tail_size;
+          broadcast_block_sizes[copy_outer_dim] = tail_size;
+          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+          broadcast_block_strides[copy_outer_dim] =
+              output_block_strides[outer_dim];
+          broadcast_block_sizes[broadcast_outer_dim] = 1;
+          broadcast_tensor_strides[broadcast_outer_dim] = 0;
+          broadcast_block_strides[broadcast_outer_dim] =
+              output_block_strides[outer_dim] * input_dims[outer_dim];
+          const Index offset = (last_multiple - outer_dim_left_index) *
+                               m_outputStrides[outer_dim];
+          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                         broadcast_block_strides, broadcast_tensor_strides,
+                         offset, output_block);
+        }
+      } else {
+        // b and c do not exist.
+        const int copy_outer_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 2 * outer_dim_start
+                : 2 * NumDims - 2 * outer_dim_start - 1;
+        input_block_sizes[outer_dim] = outer_dim_size;
+        broadcast_block_sizes[copy_outer_dim] = outer_dim_size;
+        broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+        broadcast_block_strides[copy_outer_dim] =
+            output_block_strides[outer_dim];
+        BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                       broadcast_block_strides, broadcast_tensor_strides, 0,
+                       output_block);
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }

   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

   Broadcast functor() const { return m_broadcast; }

+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
+      const Dimensions& input_block_sizes,
+      const BroadcastDimensions& broadcast_block_sizes,
+      const BroadcastDimensions& broadcast_block_strides,
+      const BroadcastDimensions& broadcast_tensor_strides, Index offset,
+      TensorBlock* output_block) const {
+    TensorBlock input_view_block(
+        static_cast<int>(Layout) == static_cast<int>(ColMajor)
+            ? indexColMajor(output_block->first_coeff_index() + offset)
+            : indexRowMajor(output_block->first_coeff_index() + offset),
+        input_block_sizes, Dimensions(m_inputStrides),
+        Dimensions(m_inputStrides), NULL);
+
+    internal::TensorBlockView<ArgType, Device> input_block(m_device, m_impl,
+                                                           input_view_block);
+    BroadcastTensorBlock broadcast_block(
+        0, broadcast_block_sizes, broadcast_block_strides,
+        broadcast_tensor_strides, output_block->data() + offset);
+
+    BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
+  }
+
  protected:
+  const Device& m_device;
   const Broadcast m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
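The broadcasting block() above relies on the 2x-rank, zero-stride view described in its comments. The standalone sketch below (plain C++ with made-up sizes; not code from the patch) shows the 1-D column-major case: sizes [d0, b0], input strides [1, 0] and block strides [1, d0] make a single strided copy repeat the input b0 times. This is the same shape/stride layout that block() hands to BroadcastTensorBlockReader, written out with explicit loops.

// Zero-stride broadcast trick, 1-D ColMajor illustration (assumed sizes).
#include <cstdio>

int main() {
  const int d0 = 3, b0 = 2;                  // input size and broadcast factor
  const float input[d0] = {10.f, 20.f, 30.f};
  float output[d0 * b0];

  const int tensor_strides[2] = {1, 0};      // broadcast_tensor_strides: 0 on the broadcast dim
  const int block_strides[2] = {1, d0};      // broadcast_block_strides in the output block

  for (int j = 0; j < b0; ++j) {             // broadcast dimension
    for (int i = 0; i < d0; ++i) {           // copy dimension
      const int src = i * tensor_strides[0] + j * tensor_strides[1];  // always re-reads the input
      const int dst = i * block_strides[0] + j * block_strides[1];
      output[dst] = input[src];
    }
  }
  for (int k = 0; k < d0 * b0; ++k) std::printf("%.0f ", output[k]);
  std::printf("\n");                         // prints: 10 20 30 10 20 30
  return 0;
}
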
@@ -144,14 +144,19 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets.
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
   };

+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+  using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
   {

@@ -184,6 +189,23 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     }
     m_inputStride *= input_dims[m_dim.actualDim()];
     m_inputOffset = m_stride * op.offset();
+
+    if (BlockAccess) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_inputStrides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+        }
+      } else {
+        m_inputStrides[NumInputDims - 1] = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      m_block_total_size_max =
+          numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
+    }
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -266,6 +288,60 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
            TensorOpCost(0, 0, cost, vectorized, PacketSize);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  // TODO(andydavis) Reduce the overhead of this function (experiment with
+  // using a fixed block size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    // Calculate input block sizes.
+    const DSizes<Index, NumDims>& output_block_sizes =
+        output_block->block_sizes();
+    const DSizes<Index, NumDims>& output_block_strides =
+        output_block->block_strides();
+    const Index chip_dim = m_dim.actualDim();
+    DSizes<Index, NumInputDims> input_block_sizes;
+    DSizes<Index, NumInputDims> input_block_strides;
+    for (Index i = 0; i < NumInputDims; ++i) {
+      if (i < chip_dim) {
+        input_block_sizes[i] = output_block_sizes[i];
+        input_block_strides[i] = output_block_strides[i];
+      } else if (i > chip_dim) {
+        input_block_sizes[i] = output_block_sizes[i - 1];
+        input_block_strides[i] = output_block_strides[i - 1];
+      } else {
+        input_block_sizes[i] = 1;
+      }
+    }
+    // Fix up input_block_stride for chip dimension.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      if (chip_dim == 0) {
+        input_block_strides[chip_dim] = 1;
+      } else {
+        input_block_strides[chip_dim] =
+            input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
+      }
+    } else {
+      if (chip_dim == NumInputDims - 1) {
+        input_block_strides[chip_dim] = 1;
+      } else {
+        input_block_strides[chip_dim] =
+            input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
+      }
+    }
+    // Instantiate and read input block from input tensor.
+    InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+                                 input_block_sizes, input_block_strides,
+                                 m_inputStrides, output_block->data());
+    m_impl.block(&input_block);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
     CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
     if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||

@@ -316,6 +392,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   Index m_stride;
   Index m_inputOffset;
   Index m_inputStride;
+  Index m_block_total_size_max;
+  DSizes<Index, NumInputDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;
   const Device& m_device;

@@ -342,12 +420,18 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    RawAccess = false
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };

+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+  using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }

@@ -395,6 +479,50 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
       }
     }
   }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const OutputTensorBlock& output_block) {
+    // Calculate input block sizes.
+    const DSizes<Index, NumDims>& output_block_sizes =
+        output_block.block_sizes();
+    const DSizes<Index, NumDims>& output_block_strides =
+        output_block.block_strides();
+    const Index chip_dim = this->m_dim.actualDim();
+    DSizes<Index, NumInputDims> input_block_sizes;
+    DSizes<Index, NumInputDims> input_block_strides;
+    for (Index i = 0; i < NumInputDims; ++i) {
+      if (i < chip_dim) {
+        input_block_sizes[i] = output_block_sizes[i];
+        input_block_strides[i] = output_block_strides[i];
+      } else if (i > chip_dim) {
+        input_block_sizes[i] = output_block_sizes[i - 1];
+        input_block_strides[i] = output_block_strides[i - 1];
+      } else {
+        input_block_sizes[i] = 1;
+      }
+    }
+    // Fix up input_block_stride for chip dimension.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      if (chip_dim == 0) {
+        input_block_strides[chip_dim] = 1;
+      } else {
+        input_block_strides[chip_dim] =
+            input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
+      }
+    } else {
+      if (chip_dim == NumInputDims - 1) {
+        input_block_strides[chip_dim] = 1;
+      } else {
+        input_block_strides[chip_dim] =
+            input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
+      }
+    }
+    // Write input block.
+    this->m_impl.writeBlock(InputTensorBlock(
+        this->srcCoeff(output_block.first_coeff_index()), input_block_sizes,
+        input_block_strides, this->m_inputStrides,
+        const_cast<ScalarNoConst*>(output_block.data())));
+  }
 };

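Both the read-side block() and the write-side writeBlock() above derive the input block shape from the output block shape in the same way: dimensions below the chipped one are copied, dimensions above it shift by one, and the chipped dimension reappears with size 1. A standalone sketch of that mapping with made-up sizes (not code from the patch):

// Chip dimension mapping, illustration only (hypothetical rank and sizes).
#include <cstdio>

int main() {
  const int kInputRank = 3;          // assume a rank-3 input chipped along...
  const int chip_dim = 1;            // ...its middle dimension
  const long output_block_sizes[kInputRank - 1] = {4, 3};  // block of the rank-2 result

  long input_block_sizes[kInputRank];
  for (int i = 0; i < kInputRank; ++i) {
    if (i < chip_dim) {
      input_block_sizes[i] = output_block_sizes[i];
    } else if (i > chip_dim) {
      input_block_sizes[i] = output_block_sizes[i - 1];
    } else {
      input_block_sizes[i] = 1;      // the chipped dimension collapses to 1
    }
  }
  for (int i = 0; i < kInputRank; ++i) std::printf("%ld ", input_block_sizes[i]);
  std::printf("\n");                 // prints: 4 1 3
  return 0;
}
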
@@ -54,6 +54,66 @@ struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorIm
   typedef TensorImagePatchOp<Rows, Cols, XprType> type;
 };

+template <typename Self, bool Vectorizable>
+struct ImagePatchCopyOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+      Scalar* dst_data, const Index src_index) {
+    const Impl& impl = self.impl();
+    for (Index i = 0; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchCopyOp<Self, true> {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+      Scalar* dst_data, const Index src_index) {
+    const Impl& impl = self.impl();
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Index vectorized_size =
+        (num_coeff_to_copy / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      Packet p = impl.template packet<Unaligned>(src_index + i);
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchPaddingOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Index num_coeff_to_pad, const Scalar padding_value,
+      const Index dst_index, Scalar* dst_data) {
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Packet padded_packet = internal::pset1<Packet>(padding_value);
+    const Index vectorized_size =
+        (num_coeff_to_pad / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
+                                                   padded_packet);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
+      dst_data[dst_index + i] = padding_value;
+    }
+  }
+};
+
 } // end namespace internal

 template<DenseIndex Rows, DenseIndex Cols, typename XprType>

@@ -184,15 +244,17 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    BlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };

+  using OutputTensorBlock = internal::TensorBlock<Scalar, Index, NumDims, Layout>;
+
 #ifdef __SYCL_DEVICE_ONLY__
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
 #else
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)

@@ -342,6 +404,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     } else {
       m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
     }
+
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -484,6 +549,146 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        m_block_total_size_max));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    using ImagePatchCopyOp = internal::ImagePatchCopyOp<Self, PacketAccess>;
+    using ImagePatchPaddingOp = internal::ImagePatchPaddingOp<Self>;
+
+    // Calculate loop limits and various input/output dim sizes.
+    const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes();
+    const bool col_major =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+    const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1];
+    const Index output_depth_dim_size =
+        m_dimensions[col_major ? 0 : NumDims - 1];
+    const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2];
+    const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2];
+    const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3];
+    const Index block_col_stride = row_dim_size * depth_dim_size;
+    const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4];
+    const Index outer_dim_size =
+        block_sizes.TotalSize() /
+        (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size);
+
+    const Index patch_size = row_dim_size * col_dim_size * depth_dim_size;
+    const Index batch_size = patch_size * patch_index_dim_size;
+
+    Index output_index = output_block->first_coeff_index();
+
+    // Loop through outer dimensions.
+    for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size;
+         ++outer_dim_index) {
+      const Index outer_output_base_index = outer_dim_index * batch_size;
+      // Find the offset of the element wrt the location of the first element.
+      const Index patchIndexStart = output_index / m_fastPatchStride;
+      const Index patchOffset =
+          (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth;
+      const Index colOffsetStart = patchOffset / m_fastColStride;
+      // Other ways to index this element.
+      const Index otherIndex =
+          (NumDims == 4) ? 0 : output_index / m_fastOtherStride;
+      const Index patch2DIndexStart =
+          (NumDims == 4)
+              ? 0
+              : (output_index - otherIndex * m_otherStride) / m_fastPatchStride;
+      // Calculate starting depth index.
+      const Index depth = output_index - (output_index / m_fastOutputDepth) *
+                                             output_depth_dim_size;
+      const Index patch_input_base_index =
+          depth + otherIndex * m_patchInputStride;
+
+      // Loop through patches.
+      for (Index patch_index_dim_index = 0;
+           patch_index_dim_index < patch_index_dim_size;
+           ++patch_index_dim_index) {
+        const Index patch_output_base_index =
+            outer_output_base_index + patch_index_dim_index * patch_size;
+        // Patch index corresponding to the passed in index.
+        const Index patchIndex = patchIndexStart + patch_index_dim_index;
+        const Index patch2DIndex =
+            (NumDims == 4) ? patchIndex
+                           : patch2DIndexStart + patch_index_dim_index;
+        const Index colIndex = patch2DIndex / m_fastOutputRows;
+        const Index input_col_base = colIndex * m_col_strides;
+        const Index row_offset_base =
+            (patch2DIndex - colIndex * m_outputRows) * m_row_strides -
+            m_rowPaddingTop;
+
+        // Loop through columns.
+        for (Index col_dim_index = 0; col_dim_index < col_dim_size;
+             ++col_dim_index) {
+          const Index col_output_base_index =
+              patch_output_base_index + col_dim_index * block_col_stride;
+
+          // Calculate col index in the input original tensor.
+          Index colOffset = colOffsetStart + col_dim_index;
+          Index inputCol =
+              input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft;
+          Index origInputCol =
+              (m_col_inflate_strides == 1)
+                  ? inputCol
+                  : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
+
+          bool pad_column = false;
+          if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+              ((m_col_inflate_strides != 1) &&
+               (inputCol != origInputCol * m_col_inflate_strides))) {
+            pad_column = true;
+          }
+
+          const Index col_input_base_index =
+              patch_input_base_index + origInputCol * m_colInputStride;
+          const Index input_row_base =
+              row_offset_base +
+              ((patchOffset + col_dim_index * output_row_dim_size) -
+               colOffset * m_colStride) *
+                  m_in_row_strides;
+          // Loop through rows.
+          for (Index row_dim_index = 0; row_dim_index < row_dim_size;
+               ++row_dim_index) {
+            const Index output_base_index =
+                col_output_base_index + row_dim_index * depth_dim_size;
+            bool pad_row = false;
+            Index inputIndex;
+            if (!pad_column) {
+              Index inputRow =
+                  input_row_base + row_dim_index * m_in_row_strides;
+              Index origInputRow =
+                  (m_row_inflate_strides == 1)
+                      ? inputRow
+                      : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride)
+                                         : 0);
+              if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+                  ((m_row_inflate_strides != 1) &&
+                   (inputRow != origInputRow * m_row_inflate_strides))) {
+                pad_row = true;
+              } else {
+                inputIndex =
+                    col_input_base_index + origInputRow * m_rowInputStride;
+              }
+            }
+            // Copy (or pad) along depth dimension.
+            if (pad_column || pad_row) {
+              ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue),
+                                       output_base_index, output_block->data());
+            } else {
+              ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index,
+                                    output_block->data(), inputIndex);
+            }
+          }
+        }
+      }
+      output_index += m_otherStride;
+    }
+  }
+
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {

@@ -538,6 +743,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   internal::TensorIntDivisor<Index> m_fastOutputDepth;

   Scalar m_paddingValue;
+  Index m_block_total_size_max;

   TensorEvaluator<ArgType, Device> m_impl;
 #ifdef EIGEN_USE_SYCL
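In the image-patch block() above, the innermost unit of work is a run of depth_dim_size coefficients: for each (patch, column, row) position the code either copies that run from the input with ImagePatchCopyOp or fills it with the padding value via ImagePatchPaddingOp when the computed input row or column falls outside the (possibly inflated) input. For instance, a patch that hangs over the image border gets its out-of-range rows and columns written as m_paddingValue one depth run at a time, while in-range runs are copied (vectorized when PacketAccess is enabled, falling back to the scalar loop for the remainder).
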
@ -102,27 +102,64 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
|||||||
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
|
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
|
||||||
typedef NewDimensions Dimensions;
|
typedef NewDimensions Dimensions;
|
||||||
|
|
||||||
|
typedef typename XprType::Index Index;
|
||||||
|
typedef typename XprType::Scalar Scalar;
|
||||||
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
|
|
||||||
|
static const int NumOutputDims = internal::array_size<Dimensions>::value;
|
||||||
|
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
BlockAccess = false,
|
// TODO(andydavis, wuke) Enable BlockAccess for the general case when the
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
// performance issue with block-based reshape is resolved.
|
||||||
CoordAccess = false, // to be implemented
|
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
|
||||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
TensorEvaluator<ArgType, Device>::RawAccess &&
|
||||||
|
NumInputDims > 0 && NumOutputDims > 0,
|
||||||
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
|
CoordAccess = false, // to be implemented
|
||||||
|
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||||
};
|
};
|
||||||
|
|
||||||
|
using ScalarNoConst = typename internal::remove_const<Scalar>::type;
|
||||||
|
|
||||||
|
using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
|
||||||
|
using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>;
|
||||||
|
using OutputTensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumOutputDims, Layout>;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||||
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
|
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
|
||||||
{
|
{
|
||||||
// The total size of the reshaped tensor must be equal to the total size
|
// The total size of the reshaped tensor must be equal to the total size
|
||||||
// of the input tensor.
|
// of the input tensor.
|
||||||
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
|
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
|
||||||
}
|
|
||||||
|
|
||||||
typedef typename XprType::Index Index;
|
if (BlockAccess) {
|
||||||
typedef typename XprType::Scalar Scalar;
|
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
|
||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
m_impl.dimensions();
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
m_outputStrides[0] = 1;
|
||||||
|
for (int i = 1; i < NumOutputDims; ++i) {
|
||||||
|
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
|
||||||
|
}
|
||||||
|
m_inputStrides[0] = 1;
|
||||||
|
for (int i = 1; i < NumInputDims; ++i) {
|
||||||
|
m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
m_outputStrides[NumOutputDims - 1] = 1;
|
||||||
|
for (int i = NumOutputDims - 2; i >= 0; --i) {
|
||||||
|
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
|
||||||
|
}
|
||||||
|
m_inputStrides[NumInputDims - 1] = 1;
|
||||||
|
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||||
|
m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
@ -148,6 +185,140 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
|||||||
return m_impl.costPerCoeff(vectorized);
|
return m_impl.costPerCoeff(vectorized);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||||
|
std::vector<internal::TensorOpResourceRequirements>* resources) const {
|
||||||
|
m_impl.getResourceRequirements(resources);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(andydavis) Reduce the overhead of this function.
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||||
|
OutputTensorBlock* output_block) const {
|
||||||
|
if (m_impl.data() != NULL) {
|
||||||
|
OutputTensorBlockReader::Run(output_block, m_impl.data());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate output block unit-stride inner dimension length.
|
||||||
|
const DSizes<Index, NumOutputDims>& output_block_sizes =
|
||||||
|
output_block->block_sizes();
|
||||||
|
Index output_inner_dim_size = 1;
|
||||||
|
Index output_outer_dim_start = NumOutputDims;
|
||||||
|
for (Index i = 0; i < NumOutputDims; ++i) {
|
||||||
|
const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||||
|
? i : NumOutputDims - i - 1;
|
||||||
|
output_inner_dim_size *= output_block_sizes[dim];
|
||||||
|
if (output_block_sizes[dim] < m_dimensions[dim]) {
|
||||||
|
output_outer_dim_start = i + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize output block iterator state.
|
||||||
|
struct BlockIteratorState {
|
||||||
|
Index stride;
|
||||||
|
Index span;
|
||||||
|
Index size;
|
||||||
|
Index count;
|
||||||
|
};
|
||||||
|
array<BlockIteratorState, NumOutputDims> block_iter_state;
|
||||||
|
|
||||||
|
for (Index i = 0; i < NumOutputDims; ++i) {
|
||||||
|
const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||||
|
? i : NumOutputDims - i - 1;
|
||||||
|
block_iter_state[i].size = output_block_sizes[dim];
|
||||||
|
block_iter_state[i].stride = m_outputStrides[dim];
|
||||||
|
block_iter_state[i].span =
|
||||||
|
block_iter_state[i].stride * (block_iter_state[i].size - 1);
|
||||||
|
block_iter_state[i].count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
const Index output_outer_dim_size = output_block_sizes.TotalSize() /
|
||||||
|
output_inner_dim_size;
|
||||||
|
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
|
||||||
|
m_impl.dimensions();
|
||||||
|
|
||||||
|
Index index = output_block->first_coeff_index();
|
||||||
|
for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
|
||||||
|
Index inner_idx = 0;
|
||||||
|
while (inner_idx < output_inner_dim_size) {
|
||||||
|
// Calculate input coords based on 'index'.
|
||||||
|
array<Index, NumInputDims> input_coords;
|
||||||
|
Index idx = index;
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
for (int i = NumInputDims - 1; i > 0; --i) {
|
||||||
|
input_coords[i] = idx / m_inputStrides[i];
|
||||||
|
idx -= input_coords[i] * m_inputStrides[i];
|
||||||
|
}
|
||||||
|
input_coords[0] = idx;
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < NumInputDims - 1; ++i) {
|
||||||
|
input_coords[i] = idx / m_inputStrides[i];
|
||||||
|
idx -= input_coords[i] * m_inputStrides[i];
|
||||||
|
}
|
||||||
|
input_coords[NumInputDims - 1] = idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate target input block shape, using at most
|
||||||
|
// 'output_inner_dim_size' coefficients along the input block's inner
|
||||||
|
// dimensions.
|
||||||
|
DSizes<Index, NumInputDims> input_block_sizes;
|
||||||
|
Index num_to_allocate = output_inner_dim_size - inner_idx;
|
||||||
|
for (Index i = 0; i < NumInputDims; ++i) {
|
||||||
|
const Index dim =
|
||||||
|
static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||||
|
? i : NumInputDims - i - 1;
|
||||||
|
input_block_sizes[dim] = numext::mini(
|
||||||
|
num_to_allocate, (static_cast<Index>(input_dims[dim]) -
|
||||||
|
input_coords[dim]));
|
||||||
|
if (input_coords[dim] == 0) {
|
||||||
|
num_to_allocate /= input_block_sizes[dim];
|
||||||
|
} else {
|
||||||
|
num_to_allocate = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate input block strides.
|
||||||
|
DSizes<Index, NumInputDims> input_block_strides;
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
input_block_strides[0] = 1;
|
||||||
|
for (int i = 1; i < NumInputDims; ++i) {
|
||||||
|
input_block_strides[i] = input_block_strides[i - 1] *
|
||||||
|
input_block_sizes[i - 1];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
input_block_strides[NumInputDims - 1] = 1;
|
||||||
|
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||||
|
input_block_strides[i] = input_block_strides[i + 1] *
|
||||||
|
input_block_sizes[i + 1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Instantiate and read input block from input tensor.
|
||||||
|
InputTensorBlock input_block(index, input_block_sizes,
|
||||||
|
input_block_strides, m_inputStrides,
|
||||||
|
output_block->data() + outer_idx *
|
||||||
|
output_inner_dim_size + inner_idx);
|
||||||
|
|
||||||
|
m_impl.block(&input_block);
|
||||||
|
|
||||||
|
const Index input_block_total_size = input_block_sizes.TotalSize();
|
||||||
|
index += input_block_total_size;
|
||||||
|
inner_idx += input_block_total_size;
|
||||||
|
}
|
||||||
|
eigen_assert(inner_idx == output_inner_dim_size);
|
||||||
|
index -= output_inner_dim_size;
|
||||||
|
// Update index.
|
||||||
|
for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
|
||||||
|
if (++block_iter_state[i].count < block_iter_state[i].size) {
|
||||||
|
index += block_iter_state[i].stride;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
block_iter_state[i].count = 0;
|
||||||
|
index -= block_iter_state[i].span;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }

   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

@@ -155,6 +326,8 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
+  DSizes<Index, NumOutputDims> m_outputStrides;
+  DSizes<Index, NumInputDims> m_inputStrides;
 };

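The `block()` body added above repeatedly converts a linear coefficient index into per-dimension input coordinates by dividing by the input strides (column-major shown first, row-major mirrored). A minimal standalone sketch of that decomposition, with hypothetical sizes and strides that are not taken from the patch:

#include <array>
#include <cassert>
#include <cstdint>

// Decompose a column-major linear index into per-dimension coordinates,
// given strides where stride[0] == 1 and stride[i] = stride[i-1] * dim[i-1].
template <int NumDims>
std::array<int64_t, NumDims> CoordsFromLinearIndex(
    int64_t index, const std::array<int64_t, NumDims>& strides) {
  std::array<int64_t, NumDims> coords{};
  for (int i = NumDims - 1; i > 0; --i) {
    coords[i] = index / strides[i];   // how many full "slabs" of dim i fit
    index -= coords[i] * strides[i];  // remainder indexes the lower dims
  }
  coords[0] = index;  // stride of the innermost dimension is 1
  return coords;
}

int main() {
  // Hypothetical 4 x 3 x 2 tensor stored column-major: strides {1, 4, 12}.
  const std::array<int64_t, 3> strides = {1, 4, 12};
  const auto c = CoordsFromLinearIndex<3>(17, strides);
  assert(c[0] == 1 && c[1] == 1 && c[2] == 1);  // 1 + 1*4 + 1*12 == 17
  return 0;
}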
@@ -322,17 +495,27 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;

+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets and sizes.
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };

+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
@@ -340,6 +523,16 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
     }

+    m_is_identity = true;
+    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_impl.dimensions()[i] >=
+                   op.sizes()[i] + op.startIndices()[i]);
+      if (m_impl.dimensions()[i] != op.sizes()[i] ||
+          op.startIndices()[i] != 0) {
+        m_is_identity = false;
+      }
+    }
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -367,13 +560,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
         m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
-  }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Sizes Dimensions;
+
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
+  }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

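Several of the evaluators touched in this commit gain an `m_is_identity` flag: it is computed once in the constructor and lets the per-coefficient accessors skip the `srcCoeff()` index remapping whenever the slice covers the whole input. A minimal sketch of that pattern on a hypothetical `SliceView` type (all names below are illustrative, not Eigen API):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical read-only view over a buffer: offsets/sizes describe a slice.
struct SliceView {
  const float* data;
  std::vector<int64_t> offsets, sizes, dims;
  bool is_identity;  // precomputed once: does the slice equal the whole tensor?

  SliceView(const float* d, std::vector<int64_t> off, std::vector<int64_t> sz,
            std::vector<int64_t> full)
      : data(d), offsets(std::move(off)), sizes(std::move(sz)),
        dims(std::move(full)) {
    is_identity = true;
    for (std::size_t i = 0; i < dims.size(); ++i) {
      if (offsets[i] != 0 || sizes[i] != dims[i]) is_identity = false;
    }
  }

  float coeff(int64_t index) const {
    if (is_identity) return data[index];  // fast path: no index remapping
    return data[remap(index)];            // slow path: translate into the source
  }

  int64_t remap(int64_t index) const {
    // Column-major remapping of a slice-local index into the full tensor.
    int64_t src = 0, src_stride = 1, out_stride = 1;
    for (std::size_t i = 0; i < dims.size(); ++i) {
      const int64_t coord = (index / out_stride) % sizes[i];
      src += (coord + offsets[i]) * src_stride;
      src_stride *= dims[i];
      out_stride *= sizes[i];
    }
    return src;
  }
};

int main() {
  float buf[6] = {0, 1, 2, 3, 4, 5};             // 3 x 2 tensor, column-major
  SliceView whole(buf, {0, 0}, {3, 2}, {3, 2});  // identity slice
  SliceView col(buf, {0, 1}, {3, 1}, {3, 2});    // second column only
  assert(whole.coeff(4) == 4.0f);
  assert(col.coeff(1) == 4.0f);  // (row 1, col 1) -> linear 1 + 1*3 = 4
  return 0;
}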
@@ -417,7 +607,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }

   template<int LoadMode>
@@ -427,6 +621,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));

+    if (m_is_identity) {
+      return m_impl.template packet<LoadMode>(index);
+    }
+
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -469,9 +667,26 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+                            output_block->block_sizes(),
+                            output_block->block_strides(),
+                            Dimensions(m_inputStrides),
+                            output_block->data());
+    m_impl.block(&input_block);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
     Scalar* result = m_impl.data();
@@ -544,7 +759,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
   Dimensions m_dimensions;
+  bool m_is_identity;
   const StartIndices m_offsets;
+  Index m_block_total_size_max;
 };

@@ -557,33 +774,46 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;

-  enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : Base(op, device)
-    { }
-
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Sizes Dimensions;

+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }

   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
+    if (this->m_is_identity) {
+      this->m_impl.template writePacket<StoreMode>(index, x);
+      return;
+    }
+
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
@@ -623,6 +853,14 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       }
     }
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlock& block) {
+    this->m_impl.writeBlock(TensorBlock(
+        this->srcCoeff(block.first_coeff_index()), block.block_sizes(),
+        block.block_strides(), Dimensions(this->m_inputStrides),
+        const_cast<ScalarNoConst*>(block.data())));
+  }
 };

@@ -739,7 +977,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
     DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+    m_is_identity = true;
     for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      if (m_strides[i] != 1 || op.startIndices()[i] != 0 ||
+          op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) {
+        m_is_identity = false;
+      }
+
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
       if(m_strides[i]>0){
         startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
@@ -822,11 +1066,15 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
@@ -873,6 +1121,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
@@ -916,7 +1165,11 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 };

@@ -356,6 +356,70 @@ template <int NPT, typename S, typename R, typename I>
 __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif

+template <typename Self, typename Op,
+          bool Vectorizable =
+              (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess)>
+class BlockReducer {
+ public:
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::PacketReturnType PacketReturnType;
+  explicit BlockReducer(const Op& reducer) : op_(reducer) {
+    accum_ = op_.initialize();
+  }
+  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+    for (Index i = 0; i < num_values_to_reduce; ++i) {
+      op_.reduce(data[index + i], &accum_);
+    }
+  }
+  CoeffReturnType Finalize() { return op_.finalize(accum_); }
+  PacketReturnType FinalizePacket() {
+    // TODO(andydavis) This function should not be called for Scalar
+    // reductions: clean this up or add an assert here.
+    return PacketReturnType();
+  }
+
+ private:
+  CoeffReturnType accum_;
+  Op op_;
+};
+
+template <typename Self, typename Op>
+class BlockReducer<Self, Op, true> {
+ public:
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::PacketReturnType PacketReturnType;
+  static const Index PacketSize =
+      internal::unpacket_traits<PacketReturnType>::size;
+
+  explicit BlockReducer(const Op& reducer) : op_(reducer) {
+    vaccum_ = op_.template initializePacket<PacketReturnType>();
+    accum_ = op_.initialize();
+  }
+  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+    const Index vectorized_size =
+        (num_values_to_reduce / PacketSize) * PacketSize;
+    for (Index i = 0; i < vectorized_size; i += PacketSize) {
+      op_.reducePacket(
+          internal::ploadt<PacketReturnType, Unaligned>(&data[index + i]),
+          &vaccum_);
+    }
+    for (Index i = vectorized_size; i < num_values_to_reduce; ++i) {
+      op_.reduce(data[index + i], &accum_);
+    }
+  }
+  CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); }
+  PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); }
+
+ private:
+  PacketReturnType vaccum_;
+  CoeffReturnType accum_;
+  Op op_;
+};
+
 } // end namespace internal

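The `BlockReducer` specializations above wrap a reducer functor for contiguous runs of block data: the scalar version accumulates one value at a time, while the vectorized one mixes packet and scalar accumulation and merges both in `Finalize()`. A minimal usage sketch with a hypothetical `SumReducer` and `ScalarBlockReducer` standing in for the Eigen types (not part of the patch):

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for an Eigen reducer functor (scalar-only interface).
struct SumReducer {
  float initialize() const { return 0.0f; }
  void reduce(float v, float* accum) const { *accum += v; }
  float finalize(float accum) const { return accum; }
};

// Scalar block reducer in the spirit of the non-vectorized specialization:
// it owns one accumulator and consumes [index, index + n) from a block buffer.
class ScalarBlockReducer {
 public:
  explicit ScalarBlockReducer(const SumReducer& r) : op_(r), accum_(r.initialize()) {}
  void Reduce(int64_t index, int64_t n, const float* data) {
    for (int64_t i = 0; i < n; ++i) op_.reduce(data[index + i], &accum_);
  }
  float Finalize() const { return op_.finalize(accum_); }

 private:
  SumReducer op_;
  float accum_;
};

int main() {
  // One reducer per output coefficient; each reduces a contiguous run of the
  // input block (here: 2 outputs, 4 reduced values each).
  const float block[8] = {1, 2, 3, 4, 10, 20, 30, 40};
  ScalarBlockReducer reducers[2] = {ScalarBlockReducer(SumReducer{}),
                                    ScalarBlockReducer(SumReducer{})};
  for (int i = 0; i < 2; ++i) reducers[i].Reduce(i * 4, 4, block);
  assert(reducers[0].Finalize() == 10.0f && reducers[1].Finalize() == 100.0f);
  return 0;
}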
@@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors>
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
+  typedef internal::reducer_traits<Op, Device> ReducerTraits;
   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -410,14 +475,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

   enum {
     IsAligned = false,
     PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
-    BlockAccess = false,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
   };

+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>;
+  using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool RunningFullReduction = (NumOutputDims==0);
@@ -451,11 +521,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumOutputDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
-      m_outputStrides.back() = 1;
+      m_outputStrides[NumOutputDims - 1] = 1;
       for (int i = NumOutputDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
   }
@@ -483,6 +555,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
           ++reduceIndex;
         } else {
           m_preservedStrides[outputIndex] = input_strides[i];
+          m_output_to_input_dim_map[outputIndex] = i;
           ++outputIndex;
         }
       }
@@ -492,6 +565,16 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
     if (NumOutputDims == 0) {
       m_preservedStrides[0] = internal::array_prod(input_dims);
     }

+    m_numValuesToReduce =
+        NumOutputDims == 0
+            ? internal::array_prod(input_dims)
+            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                  ? m_preservedStrides[0]
+                  : m_preservedStrides[NumOutputDims - 1];
+
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -686,6 +769,265 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
     }
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    // Special case full reductions to avoid input block copy below.
+    if (NumInputDims == NumReducedDims) {
+      eigen_assert(output_block->first_coeff_index() == 0);
+      eigen_assert(output_block->block_sizes().TotalSize() == 1);
+      Op reducer(m_reducer);
+      output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
+          *this, 0, m_numValuesToReduce, reducer);
+      return;
+    }
+
+    // Calculate input tensor 'slice' required to reduce output block coeffs.
+    DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
+    for (int i = 0; i < NumOutputDims; ++i) {
+      // Clip preserved input dimensions by output block size.
+      input_slice_sizes[m_output_to_input_dim_map[i]] =
+          output_block->block_sizes()[i];
+    }
+
+    // Shard input tensor slice into blocks (because it could be large if we
+    // need to reduce along several dimensions to calculate required output
+    // coefficients).
+    const Index max_coeff_count =
+        numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
+                            input_slice_sizes.TotalSize());
+
+    // Calculate max output shard size needed to keep working set of reducers
+    // in L1, while leaving enough space for reducer overhead and 'PacketSize'
+    // reductions.
+    DSizes<Index, NumInputDims> target_input_block_sizes;
+    CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
+                                   &target_input_block_sizes);
+    // Calculate indices for first preserved dimension.
+    const Index first_preserved_dim_output_index =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor)
+            ? 0
+            : NumOutputDims - 1;
+    const Index first_preserved_dim_input_index =
+        m_output_to_input_dim_map[first_preserved_dim_output_index];
+    const bool inner_most_dim_preserved =
+        first_preserved_dim_input_index ==
+            (static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                 ? 0
+                 : NumInputDims - 1) |
+        PreservingInnerMostDims;
+
+    // Calculate output block inner/outer dimension sizes.
+    const Index output_block_inner_dim_size =
+        output_block->block_sizes()[first_preserved_dim_output_index];
+    const Index output_block_outer_dim_size =
+        output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
+    // Calculate shard size for first preserved dimension.
+    const Index output_shard_size =
+        target_input_block_sizes[first_preserved_dim_input_index];
+    const Index num_output_shards =
+        (output_block_inner_dim_size + output_shard_size - 1) /
+        output_shard_size;
+
+    // Initialize 'tensor_slice_offsets' from input coords of output index.
+    DSizes<Index, NumInputDims> tensor_slice_offsets;
+    GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
+                                 &tensor_slice_offsets);
+
+    // Store tensor slice offset in first preserved dimension to be used
+    // to update tensor slice extents in loop below.
+    const Index first_preserved_dim_offset_start =
+        tensor_slice_offsets[first_preserved_dim_input_index];
+
+    array<BlockIteratorState, NumOutputDims> block_iter_state;
+
+    // Initialize state used to iterate through output coefficients
+    // and update 'tensor_slice_offsets' in outer preserved dims.
+    for (int i = 0; i < NumOutputDims - 1; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i + 1
+                          : NumOutputDims - i - 2;
+      block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
+      block_iter_state[i].output_size = output_block->block_sizes()[dim];
+      block_iter_state[i].output_count = 0;
+    }
+
+    // Allocate input block memory.
+    ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
+        m_device.allocate(max_coeff_count * sizeof(Scalar)));
+    // Allocate reducer memory.
+    const bool packet_reductions_enabled =
+        (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
+    const Index num_reducers =
+        (inner_most_dim_preserved && packet_reductions_enabled)
+            ? (output_shard_size / PacketSize + output_shard_size % PacketSize +
+               PacketSize)
+            : output_shard_size;
+    typedef internal::BlockReducer<Self, Op> BlockReducer;
+    BlockReducer* reducers = static_cast<BlockReducer*>(
+        m_device.allocate(num_reducers * sizeof(BlockReducer)));
+
+    InputDimensions input_tensor_dims(m_impl.dimensions());
+    for (Index output_outer_index = 0;
+         output_outer_index < output_block_outer_dim_size;
+         ++output_outer_index) {
+      for (Index output_shard_index = 0; output_shard_index < num_output_shards;
+           ++output_shard_index) {
+        // Initialize 'tensor_slice_extents' for this output shard.
+        DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
+        for (int i = 0; i < NumInputDims; ++i) {
+          if (i == first_preserved_dim_input_index) {
+            // Clip first preserved dim size to output shard size.
+            tensor_slice_extents[i] = numext::mini(
+                output_shard_size,
+                input_slice_sizes[i] - (tensor_slice_offsets[i] -
+                                        first_preserved_dim_offset_start));
+
+          } else if (!m_reduced[i]) {
+            // Clip outer preserved dims to size 1, so that we reduce a
+            // contiguous set of output coefficients.
+            tensor_slice_extents[i] = 1;
+          }
+        }
+
+        // Initialize output coefficient reducers.
+        for (int i = 0; i < num_reducers; ++i) {
+          new (&reducers[i]) BlockReducer(m_reducer);
+        }
+
+        using TensorSliceBlockMapper =
+            internal::TensorSliceBlockMapper<ScalarNoConst, Index, NumInputDims,
+                                             Layout>;
+
+        // TODO(andydavis) Consider removing 'input_block_stride_order' if we
+        // find that scattered reads are not worth supporting in
+        // TensorSliceBlockMapper.
+        TensorSliceBlockMapper block_mapper(
+            input_tensor_dims, tensor_slice_offsets, tensor_slice_extents,
+            target_input_block_sizes, DimensionList<Index, NumInputDims>());
+
+        const Index num_outputs_to_update =
+            tensor_slice_extents[first_preserved_dim_input_index];
+        const Index preserved_dim_vector_reducer_count =
+            (inner_most_dim_preserved && packet_reductions_enabled)
+                ? num_outputs_to_update / PacketSize
+                : 0;
+        const Index preserved_dim_vector_coeff_count =
+            inner_most_dim_preserved
+                ? preserved_dim_vector_reducer_count * PacketSize
+                : 0;
+        const Index preserved_dim_reducer_limit =
+            (inner_most_dim_preserved && packet_reductions_enabled)
+                ? (preserved_dim_vector_reducer_count +
+                   num_outputs_to_update % PacketSize)
+                : num_outputs_to_update;
+
+        const Index total_block_count = block_mapper.total_block_count();
+        for (Index b = 0; b < total_block_count; ++b) {
+          InputTensorBlock input_block =
+              block_mapper.GetBlockForIndex(b, input_block_data);
+          // Read.
+          m_impl.block(&input_block);
+
+          Index num_values_to_reduce = 1;
+          for (Index i = 0; i < NumInputDims; ++i) {
+            if (m_reduced[i]) {
+              num_values_to_reduce *= input_block.block_sizes()[i];
+            }
+          }
+          // Reduce.
+          if (inner_most_dim_preserved) {
+            const Index input_outer_dim_size =
+                input_block.block_sizes().TotalSize() / num_outputs_to_update;
+            for (Index input_outer_dim_index = 0;
+                 input_outer_dim_index < input_outer_dim_size;
+                 ++input_outer_dim_index) {
+              const Index input_outer_dim_base =
+                  input_outer_dim_index * num_outputs_to_update;
+              for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+                reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
+                                   PacketSize, input_block.data());
+              }
+              const Index scalar_reducer_base =
+                  input_outer_dim_base + preserved_dim_vector_coeff_count;
+              for (Index i = preserved_dim_vector_reducer_count;
+                   i < preserved_dim_reducer_limit; ++i) {
+                reducers[i].Reduce(scalar_reducer_base + i -
+                                       preserved_dim_vector_reducer_count,
+                                   1, input_block.data());
+              }
+            }
+          } else {
+            for (Index i = 0; i < num_outputs_to_update; ++i) {
+              reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
+                                 input_block.data());
+            }
+          }
+        }
+
+        // Finalize all reducers for this output shard.
+        const Index output_base_index =
+            output_outer_index * output_block_inner_dim_size +
+            output_shard_index * output_shard_size;
+        if (inner_most_dim_preserved) {
+          EIGEN_ALIGN_MAX
+          typename internal::remove_const<CoeffReturnType>::type
+              values[PacketSize];
+          for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+            const Index reducer_base = output_base_index + i * PacketSize;
+            internal::pstore<CoeffReturnType, PacketReturnType>(
+                values, reducers[i].FinalizePacket());
+            for (Index j = 0; j < PacketSize; ++j) {
+              output_block->data()[reducer_base + j] = values[j];
+            }
+          }
+          const Index scalar_reducer_base =
+              output_base_index + preserved_dim_vector_coeff_count;
+
+          for (Index i = preserved_dim_vector_reducer_count;
+               i < preserved_dim_reducer_limit; ++i) {
+            output_block->data()[scalar_reducer_base + i -
+                                 preserved_dim_vector_reducer_count] =
+                reducers[i].Finalize();
+          }
+        } else {
+          for (int i = 0; i < num_outputs_to_update; ++i) {
+            output_block->data()[output_base_index + i] =
+                reducers[i].Finalize();
+          }
+        }
+
+        // Update 'tensor_slice_offsets' by num outputs for this output shard.
+        tensor_slice_offsets[first_preserved_dim_input_index] +=
+            num_outputs_to_update;
+      }
+      // Update slice offset for inner preserved dim.
+      tensor_slice_offsets[first_preserved_dim_input_index] -=
+          output_block_inner_dim_size;
+      // Update slice offsets for remaining output dims.
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        BlockIteratorState& b = block_iter_state[i];
+        if (++b.output_count < b.output_size) {
+          ++tensor_slice_offsets[b.input_dim];
+          break;
+        }
+        b.output_count = 0;
+        tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
+      }
+    }
+
+    // Free memory.
+    m_device.deallocate(input_block_data);
+    m_device.deallocate(reducers);
+  }
+
   EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }

 #if defined(EIGEN_USE_SYCL)
@@ -722,6 +1064,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>

   template <typename S, typename O, typename D> friend struct internal::InnerReducer;

+  struct BlockIteratorState {
+    Index input_dim;
+    Index output_size;
+    Index output_count;
+  };
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -764,16 +1112,90 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
     return startInput;
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex(
+      Index index,
+      DSizes<Index, NumInputDims>* coords) const {
+    for (int i = 0; i < NumInputDims; ++i) {
+      (*coords)[i] = 0;
+    }
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        (*coords)[m_output_to_input_dim_map[i]] = idx;
+        index -= idx * m_outputStrides[i];
+      }
+      (*coords)[m_output_to_input_dim_map[0]] = index;
+    } else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        (*coords)[m_output_to_input_dim_map[i]] = idx;
+        index -= idx * m_outputStrides[i];
+      }
+      (*coords)[m_output_to_input_dim_map[NumOutputDims - 1]] = index;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape(
+      const Index max_coeff_count,
+      const DSizes<Index, NumInputDims>& input_slice_sizes,
+      DSizes<Index, NumInputDims>* target_input_block_sizes) const {
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+    typedef internal::BlockReducer<Self, Op> BlockReducer;
+    // TODO(andydavis) Compute reducer overhead correctly for the case where
+    // we are preserving the inner most dimension, and a single reducer
+    // reduces a packet's worth of output coefficients.
+    const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar);
+
+    Index coeff_to_allocate = max_coeff_count;
+    bool first_preserved_dim_allocated = false;
+    bool first_reduced_dim_allocated = false;
+    for (int i = 0; i < NumInputDims; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : NumInputDims - i - 1;
+      (*target_input_block_sizes)[dim] = 1;
+      if (m_reduced[dim]) {
+        // TODO(andydavis) Consider allocating to multiple reduced dimensions.
+        // Watch out for cases where reduced dimensions are not contiguous,
+        // which induces scattered reads.
+        if (!first_reduced_dim_allocated) {
+          (*target_input_block_sizes)[dim] =
+              numext::mini(input_slice_sizes[dim], coeff_to_allocate);
+          coeff_to_allocate /= (*target_input_block_sizes)[dim];
+          first_reduced_dim_allocated = true;
+        }
+      } else if (!first_preserved_dim_allocated) {
+        // TODO(andydavis) Include output block size in this L1 working set
+        // calculation.
+        const Index allocated = max_coeff_count - coeff_to_allocate;
+        const Index alloc_size = numext::maxi(
+            static_cast<Index>(1), coeff_to_allocate / reducer_overhead);
+        (*target_input_block_sizes)[dim] =
+            numext::mini(input_slice_sizes[dim], alloc_size);
+        coeff_to_allocate = numext::maxi(
+            static_cast<Index>(1),
+            coeff_to_allocate /
+                ((*target_input_block_sizes)[dim] * reducer_overhead));
+        first_preserved_dim_allocated = true;
+      }
+    }
+  }
+
   // Bitmap indicating if an input dimension is reduced or not.
   array<bool, NumInputDims> m_reduced;
   // Dimensions of the output of the operation.
   Dimensions m_dimensions;
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
   // Subset of strides of the input tensor for the non-reduced dimensions.
   // Indexed by output dimensions.
   static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
   array<Index, NumPreservedStrides> m_preservedStrides;
+  // Map from output to input dimension index.
+  array<Index, NumOutputDims> m_output_to_input_dim_map;
+  // How many values go into each reduction
+  Index m_numValuesToReduce;
+
   // Subset of strides of the input tensor for the reduced dimensions.
   // Indexed by reduced dimensions.
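To make the budget split in `CalculateTargetInputBlockShape` above concrete, the following standalone sketch reruns its arithmetic for one hypothetical column-major case: a 64 x 128 input slice reduced along dimension 0, a 4096-scalar L1 budget, and a reducer overhead of 8 scalars per output. The numbers and variable names are illustrative only, not values taken from the patch:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical inputs.
  const int64_t input_slice_sizes[2] = {64, 128};  // dim 0 reduced, dim 1 preserved
  const bool reduced[2] = {true, false};
  const int64_t max_coeff_count = 4096;  // L1 budget in scalars
  const int64_t reducer_overhead = 8;    // sizeof(BlockReducer) / sizeof(Scalar)

  int64_t target[2] = {1, 1};
  int64_t coeff_to_allocate = max_coeff_count;
  bool preserved_done = false, reduced_done = false;
  for (int dim = 0; dim < 2; ++dim) {  // column-major: inner dimension first
    if (reduced[dim] && !reduced_done) {
      // Give the first reduced dimension as much of the budget as it can use.
      target[dim] = std::min(input_slice_sizes[dim], coeff_to_allocate);
      coeff_to_allocate /= target[dim];  // 4096 / 64 = 64 scalars left
      reduced_done = true;
    } else if (!reduced[dim] && !preserved_done) {
      // Spend the rest on the first preserved dimension, discounted by the
      // per-output reducer overhead.
      const int64_t alloc =
          std::max<int64_t>(1, coeff_to_allocate / reducer_overhead);
      target[dim] = std::min(input_slice_sizes[dim], alloc);  // min(128, 8) = 8
      preserved_done = true;
    }
  }
  std::cout << "target block: " << target[0] << " x " << target[1] << "\n";  // 64 x 8
  return 0;
}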
@@ -782,6 +1204,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
   // Indexed by reduced dimensions.
   array<Index, NumReducedDims> m_reducedDims;

+  // Block size for tiled (aka TensorBlock) evaluation.
+  Index m_block_total_size_max;
+
   // Evaluator for the input expression.
   TensorEvaluator<ArgType, Device> m_impl;

@@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
 template<typename Shuffle, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -110,44 +111,60 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

   enum {
     IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = false,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
   };

+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+  using TensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();
+    m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] = input_dims[shuffle[i]];
+      m_inverseShuffle[shuffle[i]] = i;
+      if (m_is_identity && shuffle[i] != i) {
+        m_is_identity = false;
+      }
     }

-    array<Index, NumDims> inputStrides;
-
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputStrides[0] = 1;
+      m_unshuffledInputStrides[0] = 1;
       m_outputStrides[0] = 1;
+
       for (int i = 1; i < NumDims; ++i) {
-        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
-      inputStrides[NumDims - 1] = 1;
+      m_unshuffledInputStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims - 1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }

     for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] = inputStrides[shuffle[i]];
+      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
     }
+
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.firstLevelCacheSize() / sizeof(Scalar));
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -162,29 +179,151 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }

+  template <int LoadMode, typename Self, bool ImplPacketAccess>
+  struct PacketLoader {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = self.coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template<int LoadMode, typename Self>
+  struct PacketLoader<LoadMode, Self, true> {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      if (self.m_is_identity) {
+        return self.m_impl.template packet<LoadMode>(index);
+      } else {
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = self.coeff(index + i);
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  };
+
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) {
-      values[i] = coeff(index+i);
-    }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
-  }
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kUniformAllDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    if (m_impl.data() != NULL) {
+      // Fast path: we have direct access to the data, so shuffle as we read.
+      TensorBlockReader::Run(output_block,
+                             srcCoeff(output_block->first_coeff_index()),
+                             m_inverseShuffle,
+                             m_unshuffledInputStrides,
+                             m_impl.data());
+      return;
+    }
+
+    // Slow path: read unshuffled block from the input and shuffle in-place.
+    // Initialize input block sizes using input-to-output shuffle map.
+    DSizes<Index, NumDims> input_block_sizes;
+    for (Index i = 0; i < NumDims; ++i) {
+      input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
+    }
+
+    // Calculate input block strides.
+    DSizes<Index, NumDims> input_block_strides;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      input_block_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        input_block_strides[i] =
+            input_block_strides[i - 1] * input_block_sizes[i - 1];
+      }
+    } else {
+      input_block_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        input_block_strides[i] =
+            input_block_strides[i + 1] * input_block_sizes[i + 1];
+      }
+    }
+
+    // Read input block.
+    TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+                            input_block_sizes,
+                            input_block_strides,
+                            Dimensions(m_unshuffledInputStrides),
+                            output_block->data());
+
+    m_impl.block(&input_block);
+
+    // Naive in-place shuffle: random IO but block size is O(L1 cache size).
+    // TODO(andydavis) Improve the performance of this in-place shuffle.
+    const Index total_size = input_block_sizes.TotalSize();
+    std::vector<bool> bitmap(total_size, false);
+    ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data());
+    const DSizes<Index, NumDims>& output_block_strides =
+        output_block->block_strides();
+    for (Index input_index = 0; input_index < total_size; ++input_index) {
+      if (bitmap[input_index]) {
+        // Coefficient at this index has already been shuffled.
+        continue;
+      }
+
+      Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
+                                               output_block_strides);
+      if (output_index == input_index) {
+        // Coefficient already in place.
+        bitmap[output_index] = true;
+        continue;
+      }
+
+      // The following loop starts at 'input_index', and shuffles
+      // coefficients into their shuffled location at 'output_index'.
+      // It skips through the array shuffling coefficients by following
+      // the shuffle cycle starting and ending at 'start_index'.
+      ScalarNoConst evicted_value;
+      ScalarNoConst shuffled_value = data[input_index];
+      do {
+        evicted_value = data[output_index];
+        data[output_index] = shuffled_value;
+        shuffled_value = evicted_value;
+        bitmap[output_index] = true;
+        output_index = GetBlockOutputIndex(output_index, input_block_strides,
+                                           output_block_strides);
+      } while (output_index != input_index);
+
+      data[output_index] = shuffled_value;
+      bitmap[output_index] = true;
+    }
+  }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                            2 * TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>());
     return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
   }

   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
@ -195,27 +334,57 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

  EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
          output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
          output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }
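GetBlockOutputIndex converts a linear index in the un-shuffled input block into the linear index of the same coefficient in the shuffled output block: it peels off one coordinate per dimension (divide by the input stride, subtract the remainder) and re-accumulates that coordinate with the output stride of the dimension it maps to under the inverse shuffle. The snippet below is a hypothetical, standalone rendering of that arithmetic for a 2x3 column-major block whose two dimensions are swapped; ToShuffledIndex is a made-up name and the code is not part of the patch.

#include <cstdio>
#include <vector>

// Maps a linear index within the un-shuffled input block to the linear index
// the same coefficient occupies in the shuffled output block (column-major
// branch of the arithmetic above, on plain ints).
int ToShuffledIndex(int input_index,
                    const std::vector<int>& input_strides,
                    const std::vector<int>& output_strides,
                    const std::vector<int>& inverse_shuffle) {
  const int num_dims = static_cast<int>(input_strides.size());
  int output_index = 0;
  for (int i = num_dims - 1; i > 0; --i) {
    const int idx = input_index / input_strides[i];  // coordinate along input dim i
    output_index += idx * output_strides[inverse_shuffle[i]];
    input_index -= idx * input_strides[i];           // remainder for the lower dims
  }
  return output_index + input_index * output_strides[inverse_shuffle[0]];
}

int main() {
  // A 2x3 column-major block whose two dimensions are swapped (shuffle (1, 0),
  // which is its own inverse). The shuffled block is therefore 3x2.
  std::vector<int> input_strides = {1, 2};    // strides of the 2x3 input block
  std::vector<int> output_strides = {1, 3};   // strides of the 3x2 output block
  std::vector<int> inverse_shuffle = {1, 0};
  // Input coordinate (1, 0) has linear index 1; after the swap it sits at
  // coordinate (0, 1) of the 3x2 block, i.e. linear index 3.
  std::printf("1 -> %d\n", ToShuffledIndex(1, input_strides, output_strides, inverse_shuffle));
  // Input coordinate (0, 2) moves to output coordinate (2, 0): 4 -> 2.
  std::printf("4 -> %d\n", ToShuffledIndex(4, input_strides, output_strides, inverse_shuffle));
  return 0;
}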
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
-       const Index idx = index / m_outputStrides[i];
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
-       const Index idx = index / m_outputStrides[i];
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }
  Dimensions m_dimensions;
  bool m_is_identity;
  array<Index, NumDims> m_inverseShuffle;
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;
  Index m_block_total_size_max;
  TensorEvaluator<ArgType, Device> m_impl;
  /// required by sycl
  Shuffle m_shuffle;
@ -239,12 +408,18 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-   BlockAccess = false,
    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = false
  };

  using ScalarNoConst = typename internal::remove_const<Scalar>::type;

  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
  using TensorBlockWriter = internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device)
  { }

@ -265,6 +440,14 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
      this->coeffRef(index+i) = values[i];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
      const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);
    TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
                           this->m_inverseShuffle,
                           this->m_unshuffledInputStrides, this->m_impl.data());
  }
};
@ -18,22 +18,30 @@ using Eigen::RowMajor;
using Eigen::ColMajor;

// A set of tests to verify that different TensorExecutor strategies yield the
-// same results for all the ops, supporting tiled execution.
// same results for all the ops, supporting tiled evaluation.

template <int NumDims>
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
  array<Index, NumDims> dims;
  for (int i = 0; i < NumDims; ++i) {
    dims[i] = internal::random<int>(min_dim, max_dim);
  }
  return dims;
};

-template <typename Device, bool Vectorizable, bool Tileable, int Layout>
-static void test_execute_binary_expr(Device d) {
template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_binary_expr(Device d)
{
  static constexpr int Options = 0 | Layout;

  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
-  int d0 = internal::random<int>(100, 200);
-  int d1 = internal::random<int>(100, 200);
-  int d2 = internal::random<int>(100, 200);
-  static constexpr int Options = 0;
-  using IndexType = int;
-  Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
-  Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
-  Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);
  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);

  Tensor<T, NumDims, Options, Index> lhs(dims);
  Tensor<T, NumDims, Options, Index> rhs(dims);
  Tensor<T, NumDims, Options, Index> dst(dims);

  lhs.setRandom();
  rhs.setRandom();
@ -46,33 +54,389 @@ static void test_execute_binary_expr(Device d) {

  Executor::run(Assign(dst, expr), d);

-  for (int i = 0; i < d0; ++i) {
-    for (int j = 0; j < d1; ++j) {
-      for (int k = 0; k < d2; ++k) {
-        float sum = lhs(i, j, k) + rhs(i, j, k);
-        VERIFY_IS_EQUAL(sum, dst(i, j, k));
-      }
-    }
  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    T sum = lhs.coeff(i) + rhs.coeff(i);
    VERIFY_IS_EQUAL(sum, dst.coeff(i));
  }
}
-#define CALL_SUBTEST_COMBINATIONS(NAME) \
-  CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \
-  CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
-  CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_broadcasting(Device d)
{
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  const auto broadcasts = RandomDims<NumDims>(1, 7);
  const auto expr = src.broadcast(broadcasts);

  // We assume that broadcasting on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the broadcasting using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
};
template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_chipping_rvalue(Device d) {
  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Layout, Index> src(dims);
  src.setRandom();

#define TEST_CHIPPING(CHIP_DIM) \
  if (NumDims > (CHIP_DIM)) { \
    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
    const auto expr = src.template chip<(CHIP_DIM)>(offset); \
    \
    Tensor<T, NumDims - 1, Layout, Index> golden; \
    golden = expr; \
    \
    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \
    \
    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
    using Executor = internal::TensorExecutor<const Assign, Device, \
                                              Vectorizable, Tileable>; \
    \
    Executor::run(Assign(dst, expr), d); \
    \
    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
    } \
  }

  TEST_CHIPPING(0)
  TEST_CHIPPING(1)
  TEST_CHIPPING(2)
  TEST_CHIPPING(3)
  TEST_CHIPPING(4)
  TEST_CHIPPING(5)

#undef TEST_CHIPPING
};

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_chipping_lvalue(Device d) {
  auto dims = RandomDims<NumDims>(1, 10);

#define TEST_CHIPPING(CHIP_DIM) \
  if (NumDims > (CHIP_DIM)) { \
    /* Generate random data that we'll assign to the chipped tensor dim. */ \
    array<Index, NumDims - 1> src_dims; \
    for (int i = 0; i < NumDims - 1; ++i) { \
      int dim = i < (CHIP_DIM) ? i : i + 1; \
      src_dims[i] = dims[dim]; \
    } \
    \
    Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \
    src.setRandom(); \
    \
    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
    \
    /* Generate random data to fill non-chipped dimensions. */ \
    Tensor<T, NumDims, Layout, Index> random(dims); \
    random.setRandom(); \
    \
    Tensor<T, NumDims, Layout, Index> golden(dims); \
    golden = random; \
    golden.template chip<(CHIP_DIM)>(offset) = src; \
    \
    Tensor<T, NumDims, Layout, Index> dst(dims); \
    dst = random; \
    auto expr = dst.template chip<(CHIP_DIM)>(offset); \
    \
    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
    using Executor = internal::TensorExecutor<const Assign, Device, \
                                              Vectorizable, Tileable>; \
    \
    Executor::run(Assign(expr, src), d); \
    \
    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
    } \
  }

  TEST_CHIPPING(0)
  TEST_CHIPPING(1)
  TEST_CHIPPING(2)
  TEST_CHIPPING(3)
  TEST_CHIPPING(4)
  TEST_CHIPPING(5)

#undef TEST_CHIPPING
};

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_shuffle_rvalue(Device d) {
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(1, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Create a random dimension re-ordering/shuffle.
  std::vector<Index> shuffle;
  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());

  const auto expr = src.shuffle(shuffle);

  // We assume that shuffling on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden;
  golden = expr;

  // Now do the shuffling using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_shuffle_lvalue(Device d) {
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Create a random dimension re-ordering/shuffle.
  std::vector<Index> shuffle;
  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());

  array<Index, NumDims> shuffled_dims;
  for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];

  // We assume that shuffling on a default device is tested and correct, so
  // we can rely on it to verify correctness of tensor executor and tiling.
  Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
  golden.shuffle(shuffle) = src;

  // Now do the shuffling using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(shuffled_dims);

  auto expr = dst.shuffle(shuffle);

  using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(expr, src), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_reduction(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be >= 2");

  static constexpr int ReducedDims = NumDims - 2;
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick two random and unique reduction dimensions.
  int reduction0 = internal::random<int>(0, NumDims - 1);
  int reduction1 = internal::random<int>(0, NumDims - 1);
  while (reduction0 == reduction1) {
    reduction1 = internal::random<int>(0, NumDims - 1);
  }

  DSizes<Index, 2> reduction_axis;
  reduction_axis[0] = reduction0;
  reduction_axis[1] = reduction1;

  Tensor<T, ReducedDims, Options, Index> golden = src.sum(reduction_axis);

  // Now do the reduction using configured tensor executor.
  Tensor<T, ReducedDims, Options, Index> dst(golden.dimensions());

  auto expr = src.sum(reduction_axis);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_reshape(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be >= 2");

  static constexpr int ReshapedDims = NumDims - 1;
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Multiply the 0th and 1st dimensions together and then shuffle.
  std::vector<Index> shuffle;
  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());

  DSizes<Index, ReshapedDims> reshaped_dims;
  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];

  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);

  // Now reshape using configured tensor executor.
  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());

  auto expr = src.reshape(reshaped_dims);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_slice_rvalue(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be >= 2");
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick a random slice of src tensor.
  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());

  // Make sure that slice start + size do not overflow tensor dims.
  for (int i = 0; i < NumDims; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  Tensor<T, NumDims, Options, Index> golden =
      src.slice(slice_start, slice_size);

  // Now do the slicing using configured tensor executor.
  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());

  auto expr = src.slice(slice_start, slice_size);

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(dst, expr), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
          bool Tileable, int Layout>
static void test_execute_slice_lvalue(Device d)
{
  static_assert(NumDims >= 2, "NumDims must be >= 2");
  static constexpr int Options = 0 | Layout;

  auto dims = RandomDims<NumDims>(5, 10);
  Tensor<T, NumDims, Options, Index> src(dims);
  src.setRandom();

  // Pick a random slice of src tensor.
  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));

  // Make sure that slice start + size do not overflow tensor dims.
  for (int i = 0; i < NumDims; ++i) {
    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
  }

  Tensor<T, NumDims, Options, Index> slice(slice_size);
  slice.setRandom();

  // Assign a slice using default executor.
  Tensor<T, NumDims, Options, Index> golden = src;
  golden.slice(slice_start, slice_size) = slice;

  // And using configured execution strategy.
  Tensor<T, NumDims, Options, Index> dst = src;
  auto expr = dst.slice(slice_start, slice_size);

  using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(expr, slice), d);

  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
  }
}
#define CALL_SUBTEST_COMBINATIONS(NAME, T, NUM_DIMS) \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, false, ColMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, true, ColMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, false, RowMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, true, RowMajor>(default_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
  CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, RowMajor>(tp_device)))
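Each invocation of CALL_SUBTEST_COMBINATIONS above therefore instantiates the named test in sixteen configurations: {DefaultDevice, ThreadPoolDevice} x {non-vectorized, vectorized} x {non-tiled, tiled} x {ColMajor, RowMajor}. As an illustration only (this expansion does not appear in the patch), the float/3-D binary-expression entry below expands to calls such as:

  CALL_SUBTEST((test_execute_binary_expr<float, 3, DefaultDevice, false, true, ColMajor>(default_device)));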
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;
@ -81,7 +445,49 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

-  CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
  CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 2);
  CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 5);
}

#undef CALL_SUBTEST_COMBINATIONS