Tensor block evaluation V2 support for unary/binary/broadcasting

This commit is contained in:
Eugene Zhulenev 2019-09-24 12:52:45 -07:00
parent efd9867ff0
commit ef9dfee7bd
40 changed files with 1162 additions and 169 deletions

View File

@ -78,9 +78,6 @@ struct conditional { typedef Then type; };
template<typename Then, typename Else>
struct conditional <false, Then, Else> { typedef Else type; };
template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };
template<typename T> struct remove_reference { typedef T type; };
template<typename T> struct remove_reference<T&> { typedef T type; };
@ -115,6 +112,12 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };
template< class T >
struct is_void : is_same<void, typename remove_const<T>::type> {};
#if EIGEN_HAS_CXX11
template<> struct is_arithmetic<signed long long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };

View File

@ -114,6 +114,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorBlockV2.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"

View File

@ -89,12 +89,17 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device) { }
@ -226,12 +231,17 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),

View File

@ -109,12 +109,17 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
: m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
m_stride_div(op.stride_div()){}

View File

@ -110,6 +110,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@ -120,6 +122,18 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
RightTensorBlock;
typedef internal::TensorBlockAssignment<
Scalar, NumDims, typename RightTensorBlock::XprType, Index>
TensorBlockAssignment;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
@ -214,6 +228,29 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_leftImpl.writeBlock(*block);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != NULL) {
// If destination has raw data access, we pass it as a potential
// destination for a block descriptor evaluation.
desc.AddDestinationBuffer(
/*dst_base=*/m_leftImpl.data() + desc.offset(),
/*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()),
/*total_dst_bytes=*/
(internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
}
RightTensorBlock block = m_rightImpl.blockV2(desc, scratch);
// If the block was evaluated into a destination buffer, there is no need
// to do the assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
m_leftImpl.writeBlockV2(desc, block);
}
block.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {

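For context, a minimal sketch of how this new `evalBlockV2` entry point is driven. It mirrors the tiled `DefaultDevice` executor added later in this commit, and assumes an already constructed `evaluator` (for the assignment expression), a `block_mapper` for the output dimensions, and a `device`:

TensorBlockScratch scratch(device);
const StorageIndex block_count = block_mapper.total_block_count();
for (StorageIndex i = 0; i < block_count; ++i) {
  // Describe the i-th block by its first coefficient index and block sizes.
  TensorBlock block = block_mapper.GetBlockForIndex(i, /*data=*/NULL);
  TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
  // May write straight into the lhs buffer through the destination descriptor.
  evaluator.evalBlockV2(desc, scratch);
  scratch.reset();  // reuse scratch allocations for the next block
}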
View File

@ -1025,6 +1025,11 @@ class TensorBlockMapper {
return m_block_dim_sizes.TotalSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&
block_dim_sizes() const {
return m_block_dim_sizes;
}
private:
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,

View File

@ -115,6 +115,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@ -131,11 +132,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// We do block based broadcasting using a trick with 2x tensor rank and 0
// strides. See block method implementation for details.
typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
BroadcastTensorBlock;
typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
BroadcastTensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
ArgTensorBlock;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
Layout, Index>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: isCopy(false), nByOne(false), oneByN(false),
@ -867,6 +881,292 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
static const bool
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
// Return a block with a single scalar.
if (NumDims <= 0) return scalarBlock(scratch);
// Because we only support kSkewedInnerDims blocking, the block size should be
// equal to m_dimensions for the inner dims, smaller than m_dimensions[i] for
// the first outer dim, and 1 for the remaining outer dims. This is guaranteed
// by MergeResourceRequirements() in TensorBlock.h.
const Dimensions& output_dims = desc.dimensions();
const Dimensions output_strides = internal::strides<Layout>(output_dims);
// Find where outer dims start.
int outer_dim_start = 0;
Index outer_dim_size = 1;
Index inner_dim_size = 1;
for (int i = 0; i < NumDims; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
if (i > outer_dim_start) {
eigen_assert(output_dims[dim] == 1);
} else if (output_dims[dim] != m_dimensions[dim]) {
eigen_assert(output_dims[dim] < m_dimensions[dim]);
outer_dim_size = output_dims[dim];
} else {
inner_dim_size *= output_dims[dim];
++outer_dim_start;
}
}
if (inner_dim_size == 0 || outer_dim_size == 0) {
return emptyBlock();
}
const Dimensions& input_dims = Dimensions(m_impl.dimensions());
// Pre-fill input_block_sizes, bcast_block_sizes, bcast_block_strides, and
// bcast_input_strides. Later on we will only modify the outer_dim_start-th
// dimension in these arrays.
// Calculate the block sizes for reading from the input tensor.
Dimensions input_block_sizes;
for (int i = 0; i < outer_dim_start; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
input_block_sizes[dim] = input_dims[dim];
}
for (int i = outer_dim_start; i < NumDims; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
input_block_sizes[dim] = 1;
}
Dimensions input_block_strides =
internal::strides<Layout>(input_block_sizes);
// Broadcast with the 0-stride trick: Create 1 extra dim for each
// broadcast, set the input stride to 0.
//
// When ColMajor:
//
// - bcast_block_sizes:
// [d_0, b_0, d_1, b_1, ...]
//
// - bcast_block_strides:
// [output_block_strides[0], output_block_strides[0] * d_0,
// output_block_strides[1], output_block_strides[1] * d_1,
// ...]
//
// - bcast_input_strides:
// [input_block_strides[0], 0,
// input_block_strides[1], 0,
// ...].
//
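// As an illustrative worked example (shapes made up, assuming the block covers
// the whole output): a ColMajor input of dims [2, 3] broadcast by [2, 2] gives
// output dims [4, 6], output strides [1, 4] and input block strides [1, 2], so
// the arrays built below become
//
//   bcast_block_sizes   = [2, 2, 3, 2]   // [d_0, b_0, d_1, b_1]
//   bcast_block_strides = [1, 2, 4, 12]
//   bcast_input_strides = [1, 0, 2, 0]
//
// i.e. each broadcast dimension re-reads the input with a zero stride.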
BroadcastDimensions bcast_block_sizes;
BroadcastDimensions bcast_block_strides;
BroadcastDimensions bcast_input_strides;
for (int i = 0; i < outer_dim_start; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
const int copy_dim = is_col_major ? 2 * i : 2 * NumDims - 2 * i - 1;
const int broadcast_dim = is_col_major ? copy_dim + 1 : copy_dim - 1;
bcast_block_sizes[copy_dim] = input_dims[dim];
bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
bcast_block_strides[copy_dim] = output_strides[dim];
bcast_block_strides[broadcast_dim] =
output_strides[dim] * input_dims[dim];
bcast_input_strides[copy_dim] = input_block_strides[dim];
bcast_input_strides[broadcast_dim] = 0;
}
for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
const int dim = is_col_major ? i : 2 * NumDims - i - 1;
bcast_block_sizes[dim] = 1;
bcast_block_strides[dim] = 0;
bcast_input_strides[dim] = 0;
}
const int outer_dim =
is_col_major ? outer_dim_start : NumDims - outer_dim_start - 1;
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
ScalarNoConst* materialized_output =
desc.template destination<ScalarNoConst, Layout>();
bool materialized_in_output;
if (materialized_output != NULL) {
desc.DropDestinationBuffer();
materialized_in_output = true;
} else {
materialized_in_output = false;
const size_t materialized_output_size = desc.size() * sizeof(Scalar);
void* output_scratch_mem = scratch.allocate(materialized_output_size);
materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
}
size_t materialized_input_size = 0;
ScalarNoConst* materialized_input = NULL;
if (outer_dim_size == 1) {
// We just need a single block read, using the values already set up above.
BroadcastBlockV2(
input_block_sizes, input_block_strides, bcast_block_sizes,
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input, &materialized_input_size);
} else if (input_dims[outer_dim] == 1) {
// Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
const int broadcast_outer_dim =
is_col_major ? 2 * outer_dim_start + 1
: 2 * NumDims - 2 * outer_dim_start - 2;
bcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] = output_strides[outer_dim];
BroadcastBlockV2(
input_block_sizes, input_block_strides, bcast_block_sizes,
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input, &materialized_input_size);
} else {
// The general case. Let's denote the output block as x[...,
// a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
// the outer_dim_start-th dimension (< NumDims). We need to split the
// a:a+outer_dim_size into possibly 3 sub-blocks:
//
// (1) a:b, where b is the smallest multiple of
// input_dims[outer_dim_start] in [a, a+outer_dim_size].
//
// (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
// in [a, a+outer_dim_size].
//
// (3) c:a+outer_dim_size .
//
// Or, when b and c do not exist, we just need to process the whole block
// together.
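// Illustrative numbers (made up): with input_dims[outer_dim] = 4, a = 6 and
// outer_dim_size = 9 (the slice 6:15), b = divup(6, 4) * 4 = 8 and
// c = 15 / 4 * 4 = 12, giving three sub-blocks: 6:8 (partial head copy),
// 8:12 (one full input period, broadcast (12 - 8) / 4 = 1 time) and
// 12:15 (partial tail copy).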
// Find a.
const Index outer_dim_left_index =
desc.offset() / m_outputStrides[outer_dim];
// Find b and c.
const Index input_outer_dim_size = input_dims[outer_dim];
// The first multiple of input_outer_dim_size at or after a. This is b when
// it is <= outer_dim_left_index + outer_dim_size.
const Index first_multiple =
divup<Index>(outer_dim_left_index, input_outer_dim_size) *
input_outer_dim_size;
if (first_multiple <= outer_dim_left_index + outer_dim_size) {
// b exists, so does c. Find it.
const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
input_outer_dim_size * input_outer_dim_size;
const int copy_outer_dim = is_col_major
? 2 * outer_dim_start
: 2 * NumDims - 2 * outer_dim_start - 1;
const int broadcast_outer_dim =
is_col_major ? 2 * outer_dim_start + 1
: 2 * NumDims - 2 * outer_dim_start - 2;
if (first_multiple > outer_dim_left_index) {
const Index head_size = first_multiple - outer_dim_left_index;
input_block_sizes[outer_dim] = head_size;
bcast_block_sizes[copy_outer_dim] = head_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
bcast_block_sizes[broadcast_outer_dim] = 1;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] =
output_strides[outer_dim] * input_dims[outer_dim];
BroadcastBlockV2(input_block_sizes, input_block_strides,
bcast_block_sizes, bcast_block_strides,
bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input,
&materialized_input_size);
}
if (first_multiple < last_multiple) {
input_block_sizes[outer_dim] = input_outer_dim_size;
bcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
bcast_block_sizes[broadcast_outer_dim] =
(last_multiple - first_multiple) / input_outer_dim_size;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] =
output_strides[outer_dim] * input_dims[outer_dim];
const Index offset = (first_multiple - outer_dim_left_index) *
m_outputStrides[outer_dim];
BroadcastBlockV2(input_block_sizes, input_block_strides,
bcast_block_sizes, bcast_block_strides,
bcast_input_strides, offset, desc, scratch,
materialized_output, &materialized_input,
&materialized_input_size);
}
if (last_multiple < outer_dim_left_index + outer_dim_size) {
const Index tail_size =
outer_dim_left_index + outer_dim_size - last_multiple;
input_block_sizes[outer_dim] = tail_size;
bcast_block_sizes[copy_outer_dim] = tail_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
bcast_block_sizes[broadcast_outer_dim] = 1;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] =
output_strides[outer_dim] * input_dims[outer_dim];
const Index offset = (last_multiple - outer_dim_left_index) *
m_outputStrides[outer_dim];
BroadcastBlockV2(input_block_sizes, input_block_strides,
bcast_block_sizes, bcast_block_strides,
bcast_input_strides, offset, desc, scratch,
materialized_output, &materialized_input,
&materialized_input_size);
}
} else {
// b and c do not exist.
const int copy_outer_dim = is_col_major
? 2 * outer_dim_start
: 2 * NumDims - 2 * outer_dim_start - 1;
input_block_sizes[outer_dim] = outer_dim_size;
bcast_block_sizes[copy_outer_dim] = outer_dim_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
BroadcastBlockV2(
input_block_sizes, input_block_strides, bcast_block_sizes,
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input, &materialized_input_size);
}
}
return TensorBlockV2(materialized_in_output
? internal::TensorBlockKind::kMaterializedInOutput
: internal::TensorBlockKind::kMaterializedInScratch,
materialized_output,
desc.dimensions());
}
// This is a special case for `NumDims == 0`. In practice it should not
// happen often, so it's fine to do a memory allocation just for a scalar.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
scalarBlock(TensorBlockScratch& scratch) const {
void* mem = scratch.allocate(sizeof(Scalar));
ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
*buf = m_impl.coeff(0);
DSizes<Index, NumDims> dimensions;
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, buf,
dimensions);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
DSizes<Index, NumDims> dimensions;
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@ -901,6 +1201,73 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlockV2(
const Dimensions& input_block_sizes,
const Dimensions& input_block_strides,
const BroadcastDimensions& bcast_block_sizes,
const BroadcastDimensions& bcast_block_strides,
const BroadcastDimensions& bcast_input_strides, Index offset,
const TensorBlockDesc& output_desc, TensorBlockScratch& scratch,
ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
size_t* materialized_input_size) const {
// ---------------------------------------------------------------------- //
// Tensor block descriptor for reading block from the input.
const Index input_offset = output_desc.offset() + offset;
static const bool is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
TensorBlockDesc input_desc(is_col_major
? indexColMajor(input_offset)
: indexRowMajor(input_offset),
input_block_sizes);
ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch);
// ---------------------------------------------------------------------- //
// Materialize input block into a temporary memory buffer only if it's not
// already available in the arg block.
const ScalarNoConst* input_buffer = NULL;
if (input_block.data() != NULL) {
// The input block already has raw data, so there is no need to materialize it.
input_buffer = input_block.data();
} else {
// Otherwise we have to do block assignment into a temporary buffer.
// Maybe reuse previously allocated buffer, or allocate a new one with a
// scratch allocator.
const size_t input_total_size = input_block_sizes.TotalSize();
if (*materialized_input == NULL ||
*materialized_input_size < input_total_size) {
*materialized_input_size = input_total_size;
void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
*materialized_input = static_cast<ScalarNoConst*>(mem);
}
typedef internal::TensorBlockAssignment<
ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
TensorBlockAssignment;
typename TensorBlockAssignment::Dst assignment_dst(
input_block_sizes, input_block_strides, *materialized_input);
TensorBlockAssignment::Run(assignment_dst, input_block.expr());
input_buffer = *materialized_input;
}
// ---------------------------------------------------------------------- //
// Copy data from materialized input block to the materialized output, using
// given broadcast strides (strides with zeroes).
typedef internal::TensorBlockIOV2<ScalarNoConst, Index, 2 * NumDims, Layout>
TensorBlockIOV2;
typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer);
typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides,
materialized_output + offset);
TensorBlockIOV2::Copy(dst, src);
}
protected:
const Device EIGEN_DEVICE_REF m_device;
const typename internal::remove_reference<Broadcast>::type m_broadcast;

View File

@ -149,6 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Layout = TensorEvaluator<ArgType, Device>::Layout,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
// Chipping of outer-most dimension is a trivial operation, because we can
// read and write directly from the underlying tensor using single offset.
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
@ -169,6 +170,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{

View File

@ -125,11 +125,16 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
@ -318,11 +323,16 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
: Base(op, device)
{

View File

@ -376,12 +376,17 @@ struct TensorContractionEvaluatorBase
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code

View File

@ -303,11 +303,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
#endif
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{

View File

@ -310,12 +310,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
@ -783,12 +788,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{

View File

@ -243,12 +243,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{

View File

@ -96,12 +96,17 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{
@ -265,12 +270,17 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{

View File

@ -383,8 +383,17 @@ struct DSizes : array<DenseIndex, NumDims> {
}
};
template <typename IndexType, int NumDims>
std::ostream& operator<<(std::ostream& os,
const DSizes<IndexType, NumDims>& dims) {
os << "[";
for (int i = 0; i < NumDims; ++i) {
if (i > 0) os << ", ";
os << dims[i];
}
os << "]";
return os;
}
// Boilerplate
namespace internal {

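A quick usage illustration of the `DSizes` stream operator added above (the dimension values are made up):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
  std::cout << dims << std::endl;  // prints "[2, 3, 4]"
  return 0;
}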
View File

@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -124,6 +125,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}

View File

@ -43,13 +43,14 @@ struct TensorEvaluator
internal::traits<Derived>::NumDimensions : 0;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
@ -62,6 +63,10 @@ struct TensorEvaluator
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockWriter;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
m_dims(m.dimensions()),
@ -162,6 +167,22 @@ struct TensorEvaluator
TensorBlockWriter::Run(block, m_data);
}
template<typename TensorBlockV2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
const TensorBlockDesc& desc, const TensorBlockV2& block) {
assert(m_data != NULL);
typedef typename TensorBlockV2::XprType TensorBlockExpr;
typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
Index>
TensorBlockAssign;
typename TensorBlockAssign::Dst dst(desc.dimensions(),
internal::strides<Layout>(m_dims),
m_data, desc.offset());
TensorBlockAssign::Run(dst, block.expr());
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
@ -220,28 +241,43 @@ struct TensorEvaluator<const Derived, Device>
typedef StorageMemory<const Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
Layout, Index>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
@ -310,6 +346,67 @@ struct TensorEvaluator<const Derived, Device>
TensorBlockReader::Run(block, m_data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
assert(m_data != NULL);
// TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval.
// If a tensor block descriptor covers a contiguous block of the underlying
// memory, we can skip the block buffer allocation and construct a block
// directly from the existing `m_data` memory buffer.
//
// Example: (RowMajor layout)
// m_dims: [11, 12, 13, 14]
// desc.dimensions(): [1, 1, 3, 14]
//
// In this case we can construct a TensorBlock starting at
// `m_data + desc.offset()`, with `desc.dimensions()` as the block sizes.
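// For contrast (illustrative only): with the same m_dims, a descriptor of
// dimensions [1, 2, 3, 14] cannot be mapped onto a contiguous range (the outer
// dimension of size 2 is not 1), so the else-branch below copies the block
// into a scratch buffer through TensorBlockIO.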
static const bool
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
// Find out how many inner dimensions have a matching size.
int num_matching_inner_dims = 0;
for (int i = 0; i < NumCoords; ++i) {
int dim = is_col_major ? i : NumCoords - i - 1;
if (m_dims[dim] != desc.dimensions()[dim]) break;
++num_matching_inner_dims;
}
// All the outer dimensions must be of size `1`, except a single dimension
// before the matching inner dimension (`3` in the example above).
bool can_use_direct_access = true;
for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) {
int dim = is_col_major ? i : NumCoords - i - 1;
if (desc.dimension(dim) != 1) {
can_use_direct_access = false;
break;
}
}
if (can_use_direct_access) {
EvaluatorPointerType block_start = m_data + desc.offset();
return TensorBlockV2(internal::TensorBlockKind::kView, block_start,
desc.dimensions());
} else {
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem);
TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data,
desc.offset());
TensorBlockIODst dst(desc.dimensions(),
internal::strides<Layout>(desc.dimensions()),
block_buffer);
TensorBlockIO::Copy(dst, src);
return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch,
block_buffer, desc.dimensions());
}
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
@ -355,12 +452,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
#endif
,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
@ -421,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
internal::functor_traits<UnaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -446,6 +549,17 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
ArgTensorBlock;
typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
@ -505,6 +619,11 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
arg_block.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@ -537,6 +656,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
internal::functor_traits<BinaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@ -571,6 +692,20 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorEvaluator<LeftArgType, Device>::Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlockV2
LeftTensorBlock;
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
RightTensorBlock;
typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
RightTensorBlock>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@ -642,6 +777,13 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
right_block.block_strides(), right_block.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
desc.DropDestinationBuffer();
return TensorBlockV2(m_leftImpl.blockV2(desc, scratch),
m_rightImpl.blockV2(desc, scratch), m_functor);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@ -670,6 +812,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
@ -709,6 +852,10 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use arg2 or arg3 dimensions if they are known at compile time.
@ -780,6 +927,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
PacketType<Scalar, Device>::HasBlend,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -805,6 +953,10 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use then or else impl instead if they happen to be known at compile time.

View File

@ -23,7 +23,7 @@ namespace Eigen {
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
* @tparam Tileable can use block based tensor evaluation
* @tparam Tiling can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp<
* Default strategy: the expression is evaluated sequentially with a single cpu
* thread, without vectorization and block evaluation.
*/
#if EIGEN_HAS_CXX11
template <typename Expression, typename Device, bool Vectorizable,
bool Tileable>
TiledEvaluation Tiling>
#else
template <typename Expression, typename Device, bool Vectorizable,
TiledEvaluation::TiledEvaluation Tiling>
#endif
class TensorExecutor {
public:
typedef typename Expression::Index StorageIndex;
@ -109,8 +114,8 @@ class TensorAsyncExecutor {};
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
/*Tileable*/ false> {
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
/*Tiling=*/TiledEvaluation::Off> {
public:
typedef typename Expression::Index StorageIndex;
@ -152,7 +157,7 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tileable*/ true> {
/*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
@ -176,8 +181,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
if (total_size < cache_size
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tileable*/ false>::run(expr, device);
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
evaluator.cleanup();
return;
}
@ -211,6 +215,70 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
}
};
/**
* Process all the data with a single cpu thread, using blocks of data. By
* sizing a block to fit L1 cache we get better cache performance.
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
typedef typename traits<Expression>::Index StorageIndex;
static const int NumDims = traits<Expression>::NumDimensions;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
typedef internal::TensorBlockDescriptor<NumDims> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<DefaultDevice>
TensorBlockScratch;
Evaluator evaluator(expr, device);
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
// TODO(ezhulenev): Do not use tiling for small tensors?
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Size tensor blocks to fit in cache (or requested target block size).
Index block_total_size = numext::mini(cache_size, total_size);
TensorBlockShapeType block_shape = kSkewedInnerDims;
// Query expression tree for desired block size/shape.
std::vector<TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
TensorBlockMapper block_mapper(
TensorBlockDimensions(evaluator.dimensions()), block_shape,
block_total_size);
block_total_size = block_mapper.block_dims_total_size();
// Share scratch memory allocator between all blocks.
TensorBlockScratch scratch(device);
const StorageIndex total_block_count = block_mapper.total_block_count();
for (StorageIndex i = 0; i < total_block_count; ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
evaluator.evalBlockV2(desc, scratch);
scratch.reset();
}
}
evaluator.cleanup();
}
};
/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
@ -256,10 +324,11 @@ struct TensorExecutorTilingContext {
};
// Computes block evaluation parameters and allocates a temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below.
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const ThreadPoolDevice& device, const Evaluator& evaluator) {
const ThreadPoolDevice& device, const Evaluator& evaluator,
bool allocate_buffer = true) {
// Prefer blocks skewed toward inner dimension.
TensorBlockShapeType block_shape = kSkewedInnerDims;
Index block_total_size = 0;
@ -284,7 +353,13 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const size_t aligned_blocksize =
align *
divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
// TODO(ezhulenev): In the new block evaluation framework there is no need to
// allocate temporary buffers; remove this after migration.
void* buf = NULL;
if (allocate_buffer) {
buf = device.allocate((num_threads + 1) * aligned_blocksize);
}
return {block_mapper, cost * block_size, buf, aligned_blocksize};
}
@ -344,8 +419,8 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
}
};
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
@ -369,7 +444,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Index StorageIndex;
typedef typename traits<Expression>::Scalar Scalar;
@ -387,11 +463,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
if (total_size < cache_size &&
!ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tileable*/ false>::run(expr, device);
/*Tiling=*/TiledEvaluation::Off>::run(expr,
device);
evaluator.cleanup();
return;
}
@ -419,6 +496,57 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
}
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
static const int NumDims = traits<Expression>::NumDimensions;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
Evaluator::Layout>
BlockMapper;
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
typedef internal::TensorBlockDescriptor<NumDims, IndexType>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
TensorBlockScratch;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const TilingContext tiling =
internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
Vectorizable>(
device, evaluator, /*allocate_buffer=*/false);
auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
IndexType lastBlockIdx) {
TensorBlockScratch scratch(device);
for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
evaluator.evalBlockV2(desc, scratch);
scratch.reset();
}
};
device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost,
eval_block);
}
evaluator.cleanup();
}
};
template <typename Expression, typename DoneCallback, bool Vectorizable,
bool Tileable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
@ -562,8 +690,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
}
/*static*/
template <typename Expression, bool Vectorizable, bool Tileable>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
@ -711,8 +839,8 @@ struct ExecExprFunctorKernel<Expr, false, Evaluator>
range_, vectorizable_threads_, evaluator) {}
};
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {

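To tie the executor changes back to the public API, here is a small self-contained example (tensor names and shapes are made up) of an expression with unary, binary and broadcasting nodes. With the new `TiledEvaluation::On` specializations such an assignment can be evaluated block by block through `evalBlockV2` instead of coefficient by coefficient:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(128, 128), c(128, 128);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::Index, 2> bcast = {2, 2};
  // Broadcasting sets PreferBlockAccess, so block evaluation is preferred for
  // the whole expression tree when all evaluators support it.
  c = (a.broadcast(bcast) + b).exp();
  return 0;
}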
View File

@ -138,12 +138,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {

View File

@ -42,12 +42,17 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;

View File

@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
@ -109,6 +110,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_op(op.expression()),
m_device(device), m_buffer(NULL)
@ -132,13 +137,13 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
#endif
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
EvalTo evalToTmp(m_device.get(m_buffer), m_op);
const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
internal::TensorExecutor<const EvalTo,
typename internal::remove_const<Device>::type,
Vectorize, Tile>::run(evalToTmp, m_device);
internal::TensorExecutor<
const EvalTo, typename internal::remove_const<Device>::type,
/*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
/*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
run(evalToTmp, m_device);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {

View File

@ -154,23 +154,61 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
// Tiled evaluation strategy.
#if !EIGEN_HAS_CXX11
// To be able to use `TiledEvaluation::Off` in C++03 we need a namespace.
// (Use of an enumeration in a nested name specifier is a C++11 extension).
namespace TiledEvaluation {
#endif
enum TiledEvaluation {
Off = 0, // tiled evaluation is not supported
On = 1, // still a work in progress (see TensorBlockV2.h)
Legacy = 2 // soon to be deprecated (see TensorBlock.h)
};
#if !EIGEN_HAS_CXX11
} // namespace TiledEvaluation
#endif
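For context (an editorial illustration, not part of the commit): the namespace wrapper above exists so that the qualified spelling `TiledEvaluation::Off` compiles in both language modes. Under C++11 an unscoped enum name may appear as a nested-name-specifier directly; under C++03 the same tokens resolve through the wrapping namespace, so only the name of the enum type itself differs, which is what the typedef inside IsTileable below papers over.

#if EIGEN_HAS_CXX11
typedef Eigen::internal::TiledEvaluation TilingMode;                   // plain unscoped enum
#else
typedef Eigen::internal::TiledEvaluation::TiledEvaluation TilingMode;  // enum nested in the namespace
#endif
// Same enumerator spelling in both language modes (hypothetical usage):
static const TilingMode kMode = Eigen::internal::TiledEvaluation::Legacy;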
template <typename Device, typename Expression>
struct IsTileable {
#if !EIGEN_HAS_CXX11
typedef TiledEvaluation::TiledEvaluation TiledEvaluation;
#endif
// Check that block evaluation is supported and it's a preferred option (at
// least one sub-expression has much faster block evaluation, e.g.
// broadcasting).
static const bool value = TensorEvaluator<Expression, Device>::BlockAccess &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const bool BlockAccess =
TensorEvaluator<Expression, Device>::BlockAccess &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const bool BlockAccessV2 =
TensorEvaluator<Expression, Device>::BlockAccessV2 &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const TiledEvaluation value =
BlockAccessV2
? TiledEvaluation::On
: (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
};
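A brief sketch of what the trait reports (a hypothetical query, not part of the commit): an evaluator that prefers block access and implements the V2 API yields TiledEvaluation::On, one that prefers it but only implements the V1 API yields Legacy, and everything else yields Off. A plain Tensor supports block access but does not prefer it, so the trait is expected to report Off here:

// C++11 spelling; the expression type is an assumption chosen for illustration.
static const Eigen::internal::TiledEvaluation kPlainTensorTiling =
    Eigen::internal::IsTileable<Eigen::DefaultDevice, Eigen::Tensor<float, 3> >::value;
// kPlainTensorTiling == Eigen::internal::TiledEvaluation::Off (expected)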
#if EIGEN_HAS_CXX11
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
#else
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
bool Tileable = IsTileable<Device, Expression>::value>
TiledEvaluation::TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
#endif
// TODO(ezhulenev): Add TiledEvaluation support to async executor.
template <typename Expression, typename Device, typename DoneCallback,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
bool Tileable = IsTileable<Device, Expression>::value>
bool Tileable = IsTileable<Device, Expression>::BlockAccess>
class TensorAsyncExecutor;
} // end namespace internal
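To round out the picture (a sketch mirroring the async tests later in this diff, not part of the commit): TensorAsyncExecutor is still parameterized on a plain bool Tileable, as the TODO above notes, so callers pass true/false rather than a TiledEvaluation mode. Thread count, shapes, and the unary op are illustrative assumptions.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

using namespace Eigen;

static void async_assign_sketch() {
  ThreadPool pool(4);                  // illustrative thread count
  ThreadPoolDevice device(&pool, 4);

  Tensor<float, 2> src(64, 64), dst(64, 64);
  src.setRandom();

  const auto expr = src.abs();

  Eigen::Barrier done(1);
  auto on_done = [&done]() { done.Notify(); };

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using DoneCallback = decltype(on_done);
  using Executor = internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice,
                                                 DoneCallback,
                                                 /*Vectorizable=*/true,
                                                 /*Tileable=*/true>;

  Executor::runAsync(Assign(dst, expr), device, on_done);
  done.Wait();  // runAsync returns immediately; wait until evaluation completes
}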

View File

@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -105,6 +106,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
typedef internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_device(device), m_generator(op.generator())
{

View File

@ -232,6 +232,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@ -241,6 +242,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
: m_device(device), m_impl(op.expression(), device)
{

View File

@ -93,12 +93,17 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_strides(op.strides())
{

View File

@ -120,12 +120,17 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
@ -195,11 +200,16 @@ template<typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }

View File

@ -121,6 +121,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
TensorEvaluator<ArgType, Device>::RawAccess &&
NumInputDims > 0 && NumOutputDims > 0,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -137,6 +138,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
Layout>
OutputTensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
{
@ -363,6 +368,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -378,6 +384,10 @@ template<typename NewDimensions, typename ArgType, typename Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
@ -532,6 +542,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@ -543,6 +554,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
{
@ -813,6 +828,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@ -824,6 +840,10 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
@ -1002,11 +1022,16 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_device(device),
@ -1179,12 +1204,17 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }

View File

@ -99,12 +99,17 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
{

View File

@ -97,12 +97,17 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{

View File

@ -585,6 +585,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -598,6 +599,10 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
InputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);

View File

@ -142,12 +142,17 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
}
@ -374,12 +379,17 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
: m_ref(m)
{ }
@ -423,10 +433,15 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
{ }

View File

@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -128,6 +129,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device),
@ -400,6 +405,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -414,6 +420,10 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }

View File

@ -100,12 +100,17 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device),

View File

@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -129,6 +130,10 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_device(device),
@ -426,6 +431,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@ -438,6 +444,10 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
TensorBlockWriter;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }

View File

@ -115,12 +115,17 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{

View File

@ -98,12 +98,17 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_traceDim(1), m_device(device)
{

View File

@ -184,12 +184,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
m_impl(op.expression(), device)
{

View File

@ -119,6 +119,7 @@ ei_add_test(cxx11_tensor_dimension)
ei_add_test(cxx11_tensor_map)
ei_add_test(cxx11_tensor_assign)
ei_add_test(cxx11_tensor_block_access)
ei_add_test(cxx11_tensor_broadcasting)
ei_add_test(cxx11_tensor_comparisons)
ei_add_test(cxx11_tensor_forced_eval)
ei_add_test(cxx11_tensor_math)
@ -188,8 +189,9 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_of_complex)
ei_add_test(cxx11_tensor_of_strings)
ei_add_test(cxx11_tensor_lvalue)
ei_add_test(cxx11_tensor_broadcasting)
ei_add_test(cxx11_tensor_chipping)
ei_add_test(cxx11_tensor_block_eval)
ei_add_test(cxx11_tensor_block_io)
ei_add_test(cxx11_tensor_concatenation)
ei_add_test(cxx11_tensor_inflation)
ei_add_test(cxx11_tensor_morphing)

View File

@ -91,7 +91,16 @@ static void test_vectorized_broadcasting()
}
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
tensor.resize(11,3,5);
#else
array<Index, 3> new_dims;
new_dims[0] = 11;
new_dims[1] = 3;
new_dims[2] = 5;
tensor.resize(new_dims);
#endif
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);
@ -139,7 +148,16 @@ static void test_static_broadcasting()
}
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
tensor.resize(11,3,5);
#else
array<Index, 3> new_dims;
new_dims[0] = 11;
new_dims[1] = 3;
new_dims[2] = 5;
tensor.resize(new_dims);
#endif
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);

View File

@ -16,6 +16,7 @@
using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
using Eigen::internal::TiledEvaluation;
// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all the ops supporting tiled evaluation.
@ -30,7 +31,7 @@ static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_unary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -47,7 +48,7 @@ static void test_execute_unary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -58,7 +59,7 @@ static void test_execute_unary_expr(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_binary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -78,7 +79,7 @@ static void test_execute_binary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -89,7 +90,7 @@ static void test_execute_binary_expr(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting(Device d)
{
static constexpr int Options = 0 | Layout;
@ -111,7 +112,7 @@ static void test_execute_broadcasting(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -121,7 +122,7 @@ static void test_execute_broadcasting(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_rvalue(Device d)
{
auto dims = RandomDims<NumDims>(1, 10);
@ -140,7 +141,7 @@ static void test_execute_chipping_rvalue(Device d)
\
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
using Executor = internal::TensorExecutor<const Assign, Device, \
Vectorizable, Tileable>; \
Vectorizable, Tiling>; \
\
Executor::run(Assign(dst, expr), d); \
\
@ -160,7 +161,7 @@ static void test_execute_chipping_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_lvalue(Device d)
{
auto dims = RandomDims<NumDims>(1, 10);
@ -193,7 +194,7 @@ static void test_execute_chipping_lvalue(Device d)
\
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
using Executor = internal::TensorExecutor<const Assign, Device, \
Vectorizable, Tileable>; \
Vectorizable, Tiling>; \
\
Executor::run(Assign(expr, src), d); \
\
@ -213,7 +214,7 @@ static void test_execute_chipping_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_rvalue(Device d)
{
static constexpr int Options = 0 | Layout;
@ -239,7 +240,7 @@ static void test_execute_shuffle_rvalue(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -249,7 +250,7 @@ static void test_execute_shuffle_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_lvalue(Device d)
{
static constexpr int Options = 0 | Layout;
@ -278,7 +279,7 @@ static void test_execute_shuffle_lvalue(Device d)
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(expr, src), d);
@ -288,7 +289,7 @@ static void test_execute_shuffle_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_reduction(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -320,7 +321,7 @@ static void test_execute_reduction(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -330,7 +331,7 @@ static void test_execute_reduction(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -360,7 +361,7 @@ static void test_execute_reshape(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -370,7 +371,7 @@ static void test_execute_reshape(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_slice_rvalue(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -400,7 +401,7 @@ static void test_execute_slice_rvalue(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -410,7 +411,7 @@ static void test_execute_slice_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_slice_lvalue(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -443,7 +444,7 @@ static void test_execute_slice_lvalue(Device d)
using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(expr, slice), d);
@ -453,7 +454,7 @@ static void test_execute_slice_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting_of_forced_eval(Device d)
{
static constexpr int Options = 0 | Layout;
@ -475,7 +476,7 @@ static void test_execute_broadcasting_of_forced_eval(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -497,7 +498,7 @@ struct DummyGenerator {
};
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_generator_op(Device d)
{
static constexpr int Options = 0 | Layout;
@ -518,7 +519,7 @@ static void test_execute_generator_op(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -528,7 +529,7 @@ static void test_execute_generator_op(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_reverse_rvalue(Device d)
{
static constexpr int Options = 0 | Layout;
@ -553,7 +554,7 @@ static void test_execute_reverse_rvalue(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -563,7 +564,7 @@ static void test_execute_reverse_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_async_execute_unary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -584,7 +585,7 @@ static void test_async_execute_unary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using DoneCallback = decltype(on_done);
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
Vectorizable, Tileable>;
Vectorizable, Tiling>;
Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
@ -596,7 +597,7 @@ static void test_async_execute_unary_expr(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_async_execute_binary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -620,7 +621,7 @@ static void test_async_execute_binary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using DoneCallback = decltype(on_done);
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
Vectorizable, Tileable>;
Vectorizable, Tiling>;
Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
@ -640,34 +641,57 @@ static void test_async_execute_binary_expr(Device d)
#define CALL_SUBTEST_PART(PART) \
CALL_SUBTEST_##PART
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
// NOTE: Tiling V2 is currently implemented only for a limited set of expression types, and only with the default device.
#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
@ -678,69 +702,69 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::ThreadPool tp(num_threads);
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 2);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 3);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 4);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 5);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 5);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 5);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 1);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 5);
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
@ -754,4 +778,3 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
// EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
}
#undef CALL_SUBTEST_COMBINATIONS