Mirror of https://gitlab.com/libeigen/eigen.git
Tensor block evaluation V2 support for unary/binary/broadcasting
This commit is contained in:
parent efd9867ff0
commit ef9dfee7bd
@ -78,9 +78,6 @@ struct conditional { typedef Then type; };
template<typename Then, typename Else>
struct conditional <false, Then, Else> { typedef Else type; };

template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };

template<typename T> struct remove_reference { typedef T type; };
template<typename T> struct remove_reference<T&> { typedef T type; };
@ -115,6 +112,12 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };

template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };

template< class T >
struct is_void : is_same<void, typename remove_const<T>::type> {};

#if EIGEN_HAS_CXX11
template<> struct is_arithmetic<signed long long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
@ -114,6 +114,7 @@ typedef unsigned __int64 uint64_t;

#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorBlockV2.h"

#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
@ -89,12 +89,17 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};

//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device) { }
@ -226,12 +231,17 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_orig_impl(op.expression(), device),
|
||||
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
|
||||
|
@ -109,12 +109,17 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
|
||||
: m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
|
||||
m_stride_div(op.stride_div()){}
|
||||
|
@ -110,6 +110,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@ -120,6 +122,18 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;

//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
RightTensorBlock;

typedef internal::TensorBlockAssignment<
Scalar, NumDims, typename RightTensorBlock::XprType, Index>
TensorBlockAssignment;
//===--------------------------------------------------------------------===//

EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
@ -214,6 +228,29 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_leftImpl.writeBlock(*block);
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != NULL) {
// If destination has raw data access, we pass it as a potential
// destination for a block descriptor evaluation.
desc.AddDestinationBuffer(
/*dst_base=*/m_leftImpl.data() + desc.offset(),
/*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()),
/*total_dst_bytes=*/
(internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
}

RightTensorBlock block = m_rightImpl.blockV2(desc, scratch);
// If block was evaluated into a destination, there is no need to do
// assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
m_leftImpl.writeBlockV2(desc, block);
}
block.cleanup();
}

#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
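For context, a minimal standalone sketch (not part of the patch) of the destination-buffer handshake that evalBlockV2 relies on. MyDesc, MyBlock and produce_block are hypothetical stand-ins for TensorBlockDesc, the producer's TensorBlockV2 and blockV2; only the control flow mirrors the code above.

#include <cstddef>
#include <vector>

enum class BlockKind { kView, kMaterializedInScratch, kMaterializedInOutput };

struct MyBlock {                       // stand-in for a producer's TensorBlockV2
  BlockKind kind;
  const float* data;
};

struct MyDesc {                        // stand-in for TensorBlockDesc
  std::size_t offset = 0, size = 0;
  float* destination = nullptr;        // optional raw destination buffer
};

// A producer writes straight into desc.destination when one was provided and
// reports kMaterializedInOutput, so the consumer can skip its own copy.
MyBlock produce_block(const MyDesc& desc, const std::vector<float>& src) {
  if (desc.destination != nullptr) {
    for (std::size_t i = 0; i < desc.size; ++i)
      desc.destination[i] = src[desc.offset + i];
    return {BlockKind::kMaterializedInOutput, desc.destination};
  }
  return {BlockKind::kView, src.data() + desc.offset};
}

void eval_block(float* dst, const std::vector<float>& src, MyDesc desc) {
  desc.destination = dst + desc.offset;          // expose the raw destination
  MyBlock block = produce_block(desc, src);
  if (block.kind != BlockKind::kMaterializedInOutput) {
    for (std::size_t i = 0; i < desc.size; ++i)  // fallback: explicit assignment
      dst[desc.offset + i] = block.data[i];
  }
}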
@ -1025,6 +1025,11 @@ class TensorBlockMapper {
|
||||
return m_block_dim_sizes.TotalSize();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&
|
||||
block_dim_sizes() const {
|
||||
return m_block_dim_sizes;
|
||||
}
|
||||
|
||||
private:
|
||||
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
|
||||
const TensorBlockShapeType block_shape,
|
||||
|
@ -115,6 +115,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
@ -131,11 +132,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
// We do block based broadcasting using a trick with 2x tensor rank and 0
|
||||
// strides. See block method implementation for details.
|
||||
typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
|
||||
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
|
||||
BroadcastTensorBlock;
|
||||
typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
|
||||
BroadcastTensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
|
||||
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
|
||||
ArgTensorBlock;
|
||||
|
||||
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
|
||||
Layout, Index>
|
||||
TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: isCopy(false), nByOne(false), oneByN(false),
|
||||
@ -867,6 +881,292 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
|
||||
static const bool
|
||||
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
|
||||
// Return a block with a single scalar.
|
||||
if (NumDims <= 0) return scalarBlock(scratch);
|
||||
|
||||
// Because we only support kSkewedInnerDims blocking, the block size should
// be equal to m_dimensions for the inner dims, a size smaller than
// m_dimensions[i] for the first outer dim, and 1 for the remaining outer
// dims. This is guaranteed by MergeResourceRequirements() in TensorBlock.h.
|
||||
const Dimensions& output_dims = desc.dimensions();
|
||||
const Dimensions output_strides = internal::strides<Layout>(output_dims);
|
||||
|
||||
// Find where outer dims start.
|
||||
int outer_dim_start = 0;
|
||||
Index outer_dim_size = 1;
|
||||
Index inner_dim_size = 1;
|
||||
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
const int dim = is_col_major ? i : NumDims - i - 1;
|
||||
|
||||
if (i > outer_dim_start) {
|
||||
eigen_assert(output_dims[dim] == 1);
|
||||
} else if (output_dims[dim] != m_dimensions[dim]) {
|
||||
eigen_assert(output_dims[dim] < m_dimensions[dim]);
|
||||
outer_dim_size = output_dims[dim];
|
||||
} else {
|
||||
inner_dim_size *= output_dims[dim];
|
||||
++outer_dim_start;
|
||||
}
|
||||
}
|
||||
|
||||
if (inner_dim_size == 0 || outer_dim_size == 0) {
|
||||
return emptyBlock();
|
||||
}
|
||||
|
||||
const Dimensions& input_dims = Dimensions(m_impl.dimensions());
|
||||
|
||||
// Pre-fill input_block_sizes, broadcast_block_sizes,
|
||||
// broadcast_block_strides, and broadcast_tensor_strides. Later on we will
|
||||
// only modify the outer_dim_start-th dimension on these arrays.
|
||||
|
||||
// Calculate the input block size for looking into the input.
|
||||
Dimensions input_block_sizes;
|
||||
for (int i = 0; i < outer_dim_start; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
input_block_sizes[dim] = input_dims[dim];
}
for (int i = outer_dim_start; i < NumDims; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
input_block_sizes[dim] = 1;
}
|
||||
Dimensions input_block_strides =
|
||||
internal::strides<Layout>(input_block_sizes);
|
||||
|
||||
// Broadcast with the 0-stride trick: Create 1 extra dim for each
|
||||
// broadcast, set the input stride to 0.
|
||||
//
|
||||
// When ColMajor:
|
||||
//
|
||||
// - bcast_block_sizes:
|
||||
// [d_0, b_0, d_1, b_1, ...]
|
||||
//
|
||||
// - bcast_block_strides:
|
||||
// [output_block_strides[0], output_block_strides[0] * d_0,
|
||||
// output_block_strides[1], output_block_strides[1] * d_1,
|
||||
// ...]
|
||||
//
|
||||
// - bcast_input_strides:
|
||||
// [input_block_strides[0], 0,
|
||||
// input_block_strides[1], 0,
|
||||
// ...].
|
||||
//
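// Worked example (illustration only, not part of the patch). For ColMajor,
// an input of dims [2, 3] broadcast by [2, 1] into an output of dims [4, 3]
// (output strides [1, 4], input strides [1, 2]) gives:
//
//   bcast_block_sizes   = [2, 2, 3, 1]   // [d_0, b_0, d_1, b_1]
//   bcast_block_strides = [1, 2, 4, 12]
//   bcast_input_strides = [1, 0, 2, 0]
//
// so TensorBlockIOV2::Copy (in BroadcastBlockV2) revisits each input column
// b_0 = 2 times via the zero input stride instead of materializing the
// broadcast input.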
|
||||
BroadcastDimensions bcast_block_sizes;
|
||||
BroadcastDimensions bcast_block_strides;
|
||||
BroadcastDimensions bcast_input_strides;
|
||||
|
||||
for (int i = 0; i < outer_dim_start; ++i) {
|
||||
const int dim = is_col_major ? i : NumDims - i - 1;
|
||||
|
||||
const int copy_dim = is_col_major ? 2 * i : 2 * NumDims - 2 * i - 1;
|
||||
const int broadcast_dim = is_col_major ? copy_dim + 1 : copy_dim - 1;
|
||||
|
||||
bcast_block_sizes[copy_dim] = input_dims[dim];
|
||||
bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
|
||||
bcast_block_strides[copy_dim] = output_strides[dim];
|
||||
bcast_block_strides[broadcast_dim] =
|
||||
output_strides[dim] * input_dims[dim];
|
||||
bcast_input_strides[copy_dim] = input_block_strides[dim];
|
||||
bcast_input_strides[broadcast_dim] = 0;
|
||||
}
|
||||
for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
|
||||
const int dim = is_col_major ? i : 2 * NumDims - i - 1;
|
||||
bcast_block_sizes[dim] = 1;
|
||||
bcast_block_strides[dim] = 0;
|
||||
bcast_input_strides[dim] = 0;
|
||||
}
|
||||
|
||||
const int outer_dim =
|
||||
is_col_major ? outer_dim_start : NumDims - outer_dim_start - 1;
|
||||
|
||||
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
|
||||
ScalarNoConst* materialized_output =
|
||||
desc.template destination<ScalarNoConst, Layout>();
|
||||
bool materialized_in_output;
|
||||
|
||||
if (materialized_output != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
materialized_in_output = false;
|
||||
const size_t materialized_output_size = desc.size() * sizeof(Scalar);
|
||||
void* output_scratch_mem = scratch.allocate(materialized_output_size);
|
||||
materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
|
||||
}
|
||||
|
||||
size_t materialized_input_size = 0;
|
||||
ScalarNoConst* materialized_input = NULL;
|
||||
|
||||
if (outer_dim_size == 1) {
// We need just a single block read, using the values initialized above.
|
||||
BroadcastBlockV2(
|
||||
input_block_sizes, input_block_strides, bcast_block_sizes,
|
||||
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
|
||||
materialized_output, &materialized_input, &materialized_input_size);
|
||||
|
||||
} else if (input_dims[outer_dim] == 1) {
|
||||
// Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
|
||||
const int broadcast_outer_dim =
|
||||
is_col_major ? 2 * outer_dim_start + 1
|
||||
: 2 * NumDims - 2 * outer_dim_start - 2;
|
||||
|
||||
bcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
|
||||
bcast_input_strides[broadcast_outer_dim] = 0;
|
||||
bcast_block_strides[broadcast_outer_dim] = output_strides[outer_dim];
|
||||
|
||||
BroadcastBlockV2(
|
||||
input_block_sizes, input_block_strides, bcast_block_sizes,
|
||||
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
|
||||
materialized_output, &materialized_input, &materialized_input_size);
|
||||
|
||||
} else {
|
||||
// The general case. Let's denote the output block as x[...,
|
||||
// a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
|
||||
// the outer_dim_start-th dimension (< NumDims). We need to split the
|
||||
// a:a+outer_dim_size into possibly 3 sub-blocks:
|
||||
//
|
||||
// (1) a:b, where b is the smallest multiple of
|
||||
// input_dims[outer_dim_start] in [a, a+outer_dim_size].
|
||||
//
|
||||
// (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
|
||||
// in [a, a+outer_dim_size].
|
||||
//
|
||||
// (3) c:a+outer_dim_size .
|
||||
//
|
||||
// Or, when b and c do not exist, we just need to process the whole block
|
||||
// together.
|
||||
|
||||
// Find a.
|
||||
const Index outer_dim_left_index =
|
||||
desc.offset() / m_outputStrides[outer_dim];
|
||||
|
||||
// Find b and c.
|
||||
const Index input_outer_dim_size = input_dims[outer_dim];
|
||||
|
||||
// First multiple after a. This is b when <= outer_dim_left_index +
|
||||
// outer_dim_size.
|
||||
const Index first_multiple =
|
||||
divup<Index>(outer_dim_left_index, input_outer_dim_size) *
|
||||
input_outer_dim_size;
|
||||
|
||||
if (first_multiple <= outer_dim_left_index + outer_dim_size) {
|
||||
// b exists, so does c. Find it.
|
||||
const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
|
||||
input_outer_dim_size * input_outer_dim_size;
|
||||
const int copy_outer_dim = is_col_major
|
||||
? 2 * outer_dim_start
|
||||
: 2 * NumDims - 2 * outer_dim_start - 1;
|
||||
const int broadcast_outer_dim =
|
||||
is_col_major ? 2 * outer_dim_start + 1
|
||||
: 2 * NumDims - 2 * outer_dim_start - 2;
|
||||
|
||||
if (first_multiple > outer_dim_left_index) {
|
||||
const Index head_size = first_multiple - outer_dim_left_index;
|
||||
input_block_sizes[outer_dim] = head_size;
|
||||
bcast_block_sizes[copy_outer_dim] = head_size;
|
||||
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
|
||||
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
|
||||
bcast_block_sizes[broadcast_outer_dim] = 1;
|
||||
bcast_input_strides[broadcast_outer_dim] = 0;
|
||||
bcast_block_strides[broadcast_outer_dim] =
|
||||
output_strides[outer_dim] * input_dims[outer_dim];
|
||||
|
||||
BroadcastBlockV2(input_block_sizes, input_block_strides,
|
||||
bcast_block_sizes, bcast_block_strides,
|
||||
bcast_input_strides, 0, desc, scratch,
|
||||
materialized_output, &materialized_input,
|
||||
&materialized_input_size);
|
||||
}
|
||||
if (first_multiple < last_multiple) {
|
||||
input_block_sizes[outer_dim] = input_outer_dim_size;
|
||||
bcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
|
||||
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
|
||||
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
|
||||
bcast_block_sizes[broadcast_outer_dim] =
|
||||
(last_multiple - first_multiple) / input_outer_dim_size;
|
||||
bcast_input_strides[broadcast_outer_dim] = 0;
|
||||
bcast_block_strides[broadcast_outer_dim] =
|
||||
output_strides[outer_dim] * input_dims[outer_dim];
|
||||
const Index offset = (first_multiple - outer_dim_left_index) *
|
||||
m_outputStrides[outer_dim];
|
||||
|
||||
BroadcastBlockV2(input_block_sizes, input_block_strides,
|
||||
bcast_block_sizes, bcast_block_strides,
|
||||
bcast_input_strides, offset, desc, scratch,
|
||||
materialized_output, &materialized_input,
|
||||
&materialized_input_size);
|
||||
}
|
||||
if (last_multiple < outer_dim_left_index + outer_dim_size) {
|
||||
const Index tail_size =
|
||||
outer_dim_left_index + outer_dim_size - last_multiple;
|
||||
input_block_sizes[outer_dim] = tail_size;
|
||||
bcast_block_sizes[copy_outer_dim] = tail_size;
|
||||
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
|
||||
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
|
||||
bcast_block_sizes[broadcast_outer_dim] = 1;
|
||||
bcast_input_strides[broadcast_outer_dim] = 0;
|
||||
bcast_block_strides[broadcast_outer_dim] =
|
||||
output_strides[outer_dim] * input_dims[outer_dim];
|
||||
const Index offset = (last_multiple - outer_dim_left_index) *
|
||||
m_outputStrides[outer_dim];
|
||||
|
||||
BroadcastBlockV2(input_block_sizes, input_block_strides,
|
||||
bcast_block_sizes, bcast_block_strides,
|
||||
bcast_input_strides, offset, desc, scratch,
|
||||
materialized_output, &materialized_input,
|
||||
&materialized_input_size);
|
||||
}
|
||||
} else {
|
||||
// b and c do not exist.
|
||||
const int copy_outer_dim = is_col_major
|
||||
? 2 * outer_dim_start
|
||||
: 2 * NumDims - 2 * outer_dim_start - 1;
|
||||
input_block_sizes[outer_dim] = outer_dim_size;
|
||||
bcast_block_sizes[copy_outer_dim] = outer_dim_size;
|
||||
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
|
||||
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
|
||||
|
||||
BroadcastBlockV2(
|
||||
input_block_sizes, input_block_strides, bcast_block_sizes,
|
||||
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
|
||||
materialized_output, &materialized_input, &materialized_input_size);
|
||||
}
|
||||
}
|
||||
|
||||
return TensorBlockV2(materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
materialized_output,
|
||||
desc.dimensions());
|
||||
}
|
||||
|
||||
// This is a special case for `NumDims == 0`, in practice this should not
|
||||
// happen often, so it's fine to do memory allocation just for a scalar.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
scalarBlock(TensorBlockScratch& scratch) const {
|
||||
void* mem = scratch.allocate(sizeof(Scalar));
|
||||
ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
|
||||
*buf = m_impl.coeff(0);
|
||||
|
||||
DSizes<Index, NumDims> dimensions;
|
||||
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
|
||||
|
||||
return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, buf,
|
||||
dimensions);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
|
||||
DSizes<Index, NumDims> dimensions;
|
||||
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
|
||||
return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
@ -901,6 +1201,73 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlockV2(
|
||||
const Dimensions& input_block_sizes,
|
||||
const Dimensions& input_block_strides,
|
||||
const BroadcastDimensions& bcast_block_sizes,
|
||||
const BroadcastDimensions& bcast_block_strides,
|
||||
const BroadcastDimensions& bcast_input_strides, Index offset,
|
||||
const TensorBlockDesc& output_desc, TensorBlockScratch& scratch,
|
||||
ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
|
||||
size_t* materialized_input_size) const {
|
||||
// ---------------------------------------------------------------------- //
|
||||
// Tensor block descriptor for reading block from the input.
|
||||
const Index input_offset = output_desc.offset() + offset;
|
||||
static const bool is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
TensorBlockDesc input_desc(is_col_major
|
||||
? indexColMajor(input_offset)
|
||||
: indexRowMajor(input_offset),
|
||||
input_block_sizes);
|
||||
|
||||
ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch);
|
||||
|
||||
// ---------------------------------------------------------------------- //
|
||||
// Materialize input block into a temporary memory buffer only if it's not
|
||||
// already available in the arg block.
|
||||
const ScalarNoConst* input_buffer = NULL;
|
||||
|
||||
if (input_block.data() != NULL) {
|
||||
// Input block already has raw data, there is no need to materialize it.
|
||||
input_buffer = input_block.data();
|
||||
|
||||
} else {
|
||||
// Otherwise we have to do block assignment into a temporary buffer.
|
||||
|
||||
// Maybe reuse previously allocated buffer, or allocate a new one with a
|
||||
// scratch allocator.
|
||||
const size_t input_total_size = input_block_sizes.TotalSize();
|
||||
if (*materialized_input == NULL ||
|
||||
*materialized_input_size < input_total_size) {
|
||||
*materialized_input_size = input_total_size;
|
||||
void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
|
||||
*materialized_input = static_cast<ScalarNoConst*>(mem);
|
||||
}
|
||||
|
||||
typedef internal::TensorBlockAssignment<
|
||||
ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
|
||||
TensorBlockAssignment;
|
||||
|
||||
typename TensorBlockAssignment::Dst assignment_dst(
|
||||
input_block_sizes, input_block_strides, *materialized_input);
|
||||
|
||||
TensorBlockAssignment::Run(assignment_dst, input_block.expr());
|
||||
|
||||
input_buffer = *materialized_input;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------- //
|
||||
// Copy data from materialized input block to the materialized output, using
|
||||
// given broadcast strides (strides with zeroes).
|
||||
typedef internal::TensorBlockIOV2<ScalarNoConst, Index, 2 * NumDims, Layout>
|
||||
TensorBlockIOV2;
|
||||
|
||||
typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer);
|
||||
typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides,
|
||||
materialized_output + offset);
|
||||
|
||||
TensorBlockIOV2::Copy(dst, src);
|
||||
}
|
||||
|
||||
protected:
|
||||
const Device EIGEN_DEVICE_REF m_device;
|
||||
const typename internal::remove_reference<Broadcast>::type m_broadcast;
|
||||
|
@ -149,6 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = false,
|
||||
// Chipping of outer-most dimension is a trivial operation, because we can
|
||||
// read and write directly from the underlying tensor using single offset.
|
||||
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
|
||||
@ -169,6 +170,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
|
||||
OutputTensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
|
||||
{
|
||||
|
@ -125,11 +125,16 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
|
||||
{
|
||||
@ -287,7 +292,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
@ -318,11 +323,16 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{
|
||||
|
@ -376,12 +376,17 @@ struct TensorContractionEvaluatorBase
|
||||
IsAligned = true,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
// Most of the code is assuming that both input tensors are ColMajor. If the
|
||||
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
|
||||
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
|
||||
|
@ -303,11 +303,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
|
||||
#endif
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
|
@ -310,12 +310,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
|
||||
{
|
||||
@ -783,12 +788,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
|
||||
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
|
||||
{
|
||||
|
@ -243,12 +243,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
|
||||
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
|
||||
{
|
||||
|
@ -96,12 +96,17 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<XprType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
|
||||
: m_op(op), m_device(device), m_result(NULL)
|
||||
{
|
||||
@ -265,12 +270,17 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_op(op), m_device(device), m_result(NULL)
|
||||
{
|
||||
|
@ -383,8 +383,17 @@ struct DSizes : array<DenseIndex, NumDims> {
}
};


template <typename IndexType, int NumDims>
std::ostream& operator<<(std::ostream& os,
const DSizes<IndexType, NumDims>& dims) {
os << "[";
for (int i = 0; i < NumDims; ++i) {
if (i > 0) os << ", ";
os << dims[i];
}
os << "]";
return os;
}
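// Hypothetical usage of the stream operator added above (needs <iostream>):
//   Eigen::DSizes<Eigen::DenseIndex, 3> dims(4, 5, 6);
//   std::cout << dims;   // prints "[4, 5, 6]"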
|
||||
|
||||
// Boilerplate
|
||||
namespace internal {
|
||||
|
@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -124,6 +125,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
|
||||
|
||||
|
@ -43,13 +43,14 @@ struct TensorEvaluator
|
||||
internal::traits<Derived>::NumDimensions : 0;
|
||||
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef typename internal::TensorBlock<
|
||||
@ -62,9 +63,13 @@ struct TensorEvaluator
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
|
||||
m_dims(m.dimensions()),
|
||||
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
|
||||
m_dims(m.dimensions()),
|
||||
m_device(device)
|
||||
{ }
|
||||
|
||||
@ -162,6 +167,22 @@ struct TensorEvaluator
|
||||
TensorBlockWriter::Run(block, m_data);
|
||||
}
|
||||
|
||||
template<typename TensorBlockV2>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
|
||||
const TensorBlockDesc& desc, const TensorBlockV2& block) {
|
||||
assert(m_data != NULL);
|
||||
|
||||
typedef typename TensorBlockV2::XprType TensorBlockExpr;
|
||||
typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
|
||||
Index>
|
||||
TensorBlockAssign;
|
||||
typename TensorBlockAssign::Dst dst(desc.dimensions(),
|
||||
internal::strides<Layout>(m_dims),
|
||||
m_data, desc.offset());
|
||||
|
||||
TensorBlockAssign::Run(dst, block.expr());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
@ -220,28 +241,43 @@ struct TensorEvaluator<const Derived, Device>
|
||||
typedef StorageMemory<const Scalar, Device> Storage;
|
||||
typedef typename Storage::Type EvaluatorPointerType;
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
// NumDimensions is -1 for variable dim tensors
|
||||
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
||||
internal::traits<Derived>::NumDimensions : 0;
|
||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef typename internal::TensorBlock<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
|
||||
typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockIO;
|
||||
typedef typename TensorBlockIO::Dst TensorBlockIODst;
|
||||
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
|
||||
|
||||
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
|
||||
Layout, Index>
|
||||
TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
|
||||
{ }
|
||||
@ -310,6 +346,67 @@ struct TensorEvaluator<const Derived, Device>
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
|
||||
assert(m_data != NULL);
|
||||
|
||||
// TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval.
|
||||
|
||||
// If a tensor block descriptor covers a contiguous block of the underlying
|
||||
// memory, we can skip block buffer memory allocation, and construct a block
|
||||
// from existing `m_data` memory buffer.
|
||||
//
|
||||
// Example: (RowMajor layout)
|
||||
// m_dims: [11, 12, 13, 14]
|
||||
// desc.dimensions(): [1, 1, 3, 14]
|
||||
//
|
||||
// In this case we can construct a TensorBlock starting at
|
||||
// `m_data + desc.offset()`, with a `desc.dimensions()` block sizes.
|
||||
|
||||
static const bool
|
||||
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
|
||||
// Find out how many inner dimensions have a matching size.
|
||||
int num_matching_inner_dims = 0;
|
||||
for (int i = 0; i < NumCoords; ++i) {
|
||||
int dim = is_col_major ? i : NumCoords - i - 1;
|
||||
if (m_dims[dim] != desc.dimensions()[dim]) break;
|
||||
++num_matching_inner_dims;
|
||||
}
|
||||
|
||||
// All the outer dimensions must be of size `1`, except a single dimension
|
||||
// before the matching inner dimension (`3` in the example above).
|
||||
bool can_use_direct_access = true;
|
||||
for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) {
|
||||
int dim = is_col_major ? i : NumCoords - i - 1;
|
||||
if (desc.dimension(dim) != 1) {
|
||||
can_use_direct_access = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (can_use_direct_access) {
|
||||
EvaluatorPointerType block_start = m_data + desc.offset();
|
||||
return TensorBlockV2(internal::TensorBlockKind::kView, block_start,
|
||||
desc.dimensions());
|
||||
|
||||
} else {
|
||||
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
|
||||
ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem);
|
||||
|
||||
TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data,
|
||||
desc.offset());
|
||||
TensorBlockIODst dst(desc.dimensions(),
|
||||
internal::strides<Layout>(desc.dimensions()),
|
||||
block_buffer);
|
||||
|
||||
TensorBlockIO::Copy(dst, src);
|
||||
|
||||
return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch,
|
||||
block_buffer, desc.dimensions());
|
||||
}
|
||||
}
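// Worked example of the direct-access check above (illustration only, not
// part of the patch). With RowMajor, m_dims = [11, 12, 13, 14] and
// desc.dimensions() = [1, 1, 3, 14]: the innermost dim matches (14 == 14),
// the next one does not (13 != 3), so num_matching_inner_dims == 1. The one
// dimension just outside the match may be partial (3) and all remaining
// outer dims are 1, so the block is a contiguous run of 3 * 14 coefficients
// and is returned as a kView starting at m_data + desc.offset().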
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
@ -355,12 +452,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
||||
#endif
|
||||
,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
|
||||
@ -421,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<UnaryOp>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -446,6 +549,17 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
|
||||
TensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
|
||||
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
|
||||
ArgTensorBlock;
|
||||
|
||||
typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
|
||||
TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
|
||||
@ -505,6 +619,11 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
arg_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
|
||||
return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
@ -537,6 +656,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
internal::functor_traits<BinaryOp>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
|
||||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
@ -571,6 +692,20 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
TensorEvaluator<LeftArgType, Device>::Layout>
|
||||
TensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
|
||||
typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlockV2
|
||||
LeftTensorBlock;
|
||||
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
|
||||
RightTensorBlock;
|
||||
|
||||
typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
|
||||
RightTensorBlock>
|
||||
TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use right impl instead if right impl dimensions are known at compile time.
|
||||
@ -642,6 +777,13 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
right_block.block_strides(), right_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
|
||||
desc.DropDestinationBuffer();
|
||||
return TensorBlockV2(m_leftImpl.blockV2(desc, scratch),
|
||||
m_rightImpl.blockV2(desc, scratch), m_functor);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
@ -670,6 +812,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
||||
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
|
||||
internal::functor_traits<TernaryOp>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -709,6 +852,10 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
||||
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||
typedef typename Storage::Type EvaluatorPointerType;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use arg2 or arg3 dimensions if they are known at compile time.
|
||||
@ -780,6 +927,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
||||
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
|
||||
PacketType<Scalar, Device>::HasBlend,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<IfArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -805,6 +953,10 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
||||
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||
typedef typename Storage::Type EvaluatorPointerType;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use then or else impl instead if they happen to be known at compile time.
|
||||
|
@ -23,7 +23,7 @@ namespace Eigen {
|
||||
*
|
||||
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
|
||||
* instructions)
|
||||
* @tparam Tileable can use block based tensor evaluation
|
||||
* @tparam Tiling can use block based tensor evaluation
|
||||
* (see TensorBlock.h)
|
||||
*/
|
||||
namespace internal {
|
||||
@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp<
|
||||
* Default strategy: the expression is evaluated sequentially with a single cpu
|
||||
* thread, without vectorization and block evaluation.
|
||||
*/
|
||||
#if EIGEN_HAS_CXX11
|
||||
template <typename Expression, typename Device, bool Vectorizable,
|
||||
bool Tileable>
|
||||
TiledEvaluation Tiling>
|
||||
#else
|
||||
template <typename Expression, typename Device, bool Vectorizable,
|
||||
TiledEvaluation::TiledEvaluation Tiling>
|
||||
#endif
|
||||
class TensorExecutor {
|
||||
public:
|
||||
typedef typename Expression::Index StorageIndex;
@ -109,8 +114,8 @@ class TensorAsyncExecutor {};
 * Process all the data with a single cpu thread, using vectorized instructions.
 */
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
                     /*Tileable*/ false> {
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

@ -152,7 +157,7 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tileable*/ true> {
                     /*Tiling=*/TiledEvaluation::Legacy> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

@ -176,8 +181,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
    if (total_size < cache_size
        && !ExpressionHasTensorBroadcastingOp<Expression>::value) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
                               /*Tileable*/ false>::run(expr, device);
      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr,device);
      evaluator.cleanup();
      return;
    }

@ -211,6 +215,70 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
  }
};

/**
 * Process all the data with a single cpu thread, using blocks of data. By
 * sizing a block to fit L1 cache we get better cache performance.
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
    typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
    typedef typename TensorBlock::Dimensions TensorBlockDimensions;

    typedef internal::TensorBlockDescriptor<NumDims> TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);
    Index total_size = array_prod(evaluator.dimensions());
    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Size tensor blocks to fit in cache (or requested target block size).
      Index block_total_size = numext::mini(cache_size, total_size);
      TensorBlockShapeType block_shape = kSkewedInnerDims;
      // Query expression tree for desired block size/shape.
      std::vector<TensorOpResourceRequirements> resources;
      evaluator.getResourceRequirements(&resources);
      MergeResourceRequirements(resources, &block_shape, &block_total_size);

      TensorBlockMapper block_mapper(
          TensorBlockDimensions(evaluator.dimensions()), block_shape,
          block_total_size);
      block_total_size = block_mapper.block_dims_total_size();

      // Share scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.total_block_count();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);

        TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
        evaluator.evalBlockV2(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
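For reference, the executor tests added later in this commit drive this specialization directly. A minimal sketch of that usage on the default device; the tensor sizes and the abs() expression are illustrative, and it assumes the headers from this branch (TensorBlockV2.h and the TiledEvaluation enum):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using namespace Eigen;
  Tensor<float, 3> src(17, 8, 5), dst(17, 8, 5);
  src.setRandom();

  const auto expr = src.abs();  // unary expression with block evaluation V2 support
  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor = internal::TensorExecutor<
      const Assign, DefaultDevice,
      /*Vectorizable=*/true, /*Tiling=*/internal::TiledEvaluation::On>;

  DefaultDevice device;
  Executor::run(Assign(dst, expr), device);  // dst == src.abs() coefficient-wise
  return 0;
}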
/**
 * Multicore strategy: the index space is partitioned and each partition is
 * executed on a single core.

@ -256,10 +324,11 @@ struct TensorExecutorTilingContext {
};

// Computes a block evaluation parameters, and allocates temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below.
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const ThreadPoolDevice& device, const Evaluator& evaluator) {
    const ThreadPoolDevice& device, const Evaluator& evaluator,
    bool allocate_buffer = true) {
  // Prefer blocks skewed toward inner dimension.
  TensorBlockShapeType block_shape = kSkewedInnerDims;
  Index block_total_size = 0;

@ -284,7 +353,13 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
  void* buf = device.allocate((num_threads + 1) * aligned_blocksize);

  // TODO(ezhulenev): In new block evaluation framework there is no need for
  // allocating temporary buffers, remove this after migration.
  void* buf = NULL;
  if (allocate_buffer) {
    buf = device.allocate((num_threads + 1) * aligned_blocksize);
  }

  return {block_mapper, cost * block_size, buf, aligned_blocksize};
}
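A sketch of the two intended call sites of this helper, using the TilingContext/BlockMapper/Vectorizable names that appear in the ThreadPoolDevice executors below; only the allocate_buffer argument differs:

// Legacy tiling keeps the per-thread staging buffer (default argument).
const TilingContext legacy = GetTensorExecutorTilingContext<
    Evaluator, BlockMapper, Vectorizable>(device, evaluator);

// Block evaluation V2 writes through TensorBlockScratchAllocator instead,
// so the temporary buffer is skipped.
const TilingContext v2 = GetTensorExecutorTilingContext<
    Evaluator, BlockMapper, Vectorizable>(device, evaluator,
                                          /*allocate_buffer=*/false);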
@ -344,8 +419,8 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  }
};

template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

@ -369,7 +444,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::Legacy> {
 public:
  typedef typename traits<Expression>::Index StorageIndex;
  typedef typename traits<Expression>::Scalar Scalar;

@ -387,11 +463,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
    Index total_size = array_prod(evaluator.dimensions());
    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);

    if (total_size < cache_size
        && !ExpressionHasTensorBroadcastingOp<Expression>::value) {
    if (total_size < cache_size &&
        !ExpressionHasTensorBroadcastingOp<Expression>::value) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                               /*Tileable*/ false>::run(expr, device);
                               /*Tiling=*/TiledEvaluation::Off>::run(expr,
                                                                     device);
      evaluator.cleanup();
      return;
    }

@ -419,6 +496,57 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
                            Evaluator::Layout>
      BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(
              device, evaluator, /*allocate_buffer=*/false);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
          auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
          TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
          evaluator.evalBlockV2(desc, scratch);
          scratch.reset();
        }
      };

      device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost,
                         eval_block);
    }
    evaluator.cleanup();
  }
};
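The sharding above is plain ThreadPoolDevice::parallelFor. A self-contained sketch of that API outside the executor; the pool size and the TensorOpCost numbers are placeholders (in the executor they come from the tiling context):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdio>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  const Eigen::Index total_block_count = 32;
  const Eigen::TensorOpCost cost(/*bytes_loaded=*/4096, /*bytes_stored=*/4096,
                                 /*compute_cycles=*/1024);

  // parallelFor splits [0, total_block_count) into ranges and runs the
  // callback on the pool; each range maps to a contiguous set of blocks.
  device.parallelFor(total_block_count, cost,
                     [](Eigen::Index first, Eigen::Index last) {
                       std::printf("evaluating blocks [%ld, %ld)\n",
                                   static_cast<long>(first),
                                   static_cast<long>(last));
                     });
  return 0;
}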
template <typename Expression, typename DoneCallback, bool Vectorizable,
          bool Tileable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,

@ -562,8 +690,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);

@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
}

/*static*/
template <typename Expression, bool Vectorizable, bool Tileable>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);

@ -711,8 +839,8 @@ struct ExecExprFunctorKernel<Expr, false, Evaluator>
          range_, vectorizable_threads_, evaluator) {}
};

template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
@ -138,12 +138,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
    IsAligned = false,
    PacketAccess = true,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlockV2;
  //===--------------------------------------------------------------------===//

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    for (int i = 0; i < NumDims; ++i) {

@ -42,12 +42,17 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
    IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = false,
    Layout = Options_ & RowMajor ? RowMajor : ColMajor,
    CoordAccess = true,
    RawAccess = true
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlockV2;
  //===--------------------------------------------------------------------===//

  typedef Dimensions_ Dimensions;
  static const std::size_t NumIndices = Dimensions::count;

@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
    IsAligned = true,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
    BlockAccessV2 = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = true

@ -109,8 +110,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
      TensorBlockReader;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlockV2;
  //===--------------------------------------------------------------------===//

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
    : m_impl(op.expression(), device), m_op(op.expression()),
      : m_impl(op.expression(), device), m_op(op.expression()),
        m_device(device), m_buffer(NULL)
  { }

@ -132,13 +137,13 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
#endif
    typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
    const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
    const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
                      TensorEvaluator<const ArgType, Device>::PreferBlockAccess;

    internal::TensorExecutor<const EvalTo,
                             typename internal::remove_const<Device>::type,
                             Vectorize, Tile>::run(evalToTmp, m_device);
    internal::TensorExecutor<
        const EvalTo, typename internal::remove_const<Device>::type,
        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
        run(evalToTmp, m_device);

    return true;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@ -154,23 +154,61 @@ struct IsVectorizable<GpuDevice, Expression> {
      TensorEvaluator<Expression, GpuDevice>::IsAligned;
};

// Tiled evaluation strategy.
#if !EIGEN_HAS_CXX11
// To be able to use `TiledEvaluation::Off` in C++03 we need a namespace.
// (Use of enumeration in a nested name specifier is a c++11 extension).
namespace TiledEvaluation {
#endif
enum TiledEvaluation {
  Off = 0,    // tiled evaluation is not supported
  On = 1,     // still work in progress (see TensorBlockV2.h)
  Legacy = 2  // soon to be deprecated (see TensorBock.h)
};
#if !EIGEN_HAS_CXX11
}  // namespace TiledEvaluation
#endif

template <typename Device, typename Expression>
struct IsTileable {
#if !EIGEN_HAS_CXX11
  typedef TiledEvaluation::TiledEvaluation TiledEvaluation;
#endif

  // Check that block evaluation is supported and it's a preferred option (at
  // least one sub-expression has much faster block evaluation, e.g.
  // broadcasting).
  static const bool value = TensorEvaluator<Expression, Device>::BlockAccess &&
                            TensorEvaluator<Expression, Device>::PreferBlockAccess;
  static const bool BlockAccess =
      TensorEvaluator<Expression, Device>::BlockAccess &&
      TensorEvaluator<Expression, Device>::PreferBlockAccess;

  static const bool BlockAccessV2 =
      TensorEvaluator<Expression, Device>::BlockAccessV2 &&
      TensorEvaluator<Expression, Device>::PreferBlockAccess;

  static const TiledEvaluation value =
      BlockAccessV2
          ? TiledEvaluation::On
          : (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
};

#if EIGEN_HAS_CXX11
template <typename Expression, typename Device,
          bool Vectorizable = IsVectorizable<Device, Expression>::value,
          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
#else
template <typename Expression, typename Device,
          bool Vectorizable = IsVectorizable<Device, Expression>::value,
          bool Tileable = IsTileable<Device, Expression>::value>
          TiledEvaluation::TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
#endif

// TODO(ezhulenev): Add TiledEvaluation support to async executor.
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable = IsVectorizable<Device, Expression>::value,
          bool Tileable = IsTileable<Device, Expression>::value>
          bool Tileable = IsTileable<Device, Expression>::BlockAccess>
class TensorAsyncExecutor;

} // end namespace internal
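With these defaults in place, call sites can let the traits choose both executor flags, as the TensorForcedEvalOp evaluator in this patch does. A free-standing sketch of that dispatch; the helper name is illustrative:

#include <unsupported/Eigen/CXX11/Tensor>

template <typename Expression, typename Device>
void run_with_auto_flags(const Expression& expr, const Device& device) {
  // IsVectorizable / IsTileable pick Vectorizable and Tiling exactly as the
  // default template arguments of TensorExecutor above would.
  Eigen::internal::TensorExecutor<
      const Expression, Device,
      /*Vectorizable=*/Eigen::internal::IsVectorizable<Device, const Expression>::value,
      /*Tiling=*/Eigen::internal::IsTileable<Device, const Expression>::value>::
      run(expr, device);
}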
|
||||
|
@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -105,6 +106,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
typedef internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
|
||||
TensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_device(device), m_generator(op.generator())
|
||||
{
|
||||
|
@ -232,6 +232,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
@ -241,6 +242,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
|
||||
OutputTensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
|
||||
: m_device(device), m_impl(op.expression(), device)
|
||||
{
|
||||
|
@ -93,12 +93,17 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_strides(op.strides())
|
||||
{
|
||||
|
@ -120,12 +120,17 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
@ -195,11 +200,16 @@ template<typename ArgType, typename Device>
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
CoordAccess = false // to be implemented
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
@ -121,6 +121,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
|
||||
TensorEvaluator<ArgType, Device>::RawAccess &&
|
||||
NumInputDims > 0 && NumOutputDims > 0,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -137,6 +138,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
Layout>
|
||||
OutputTensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
|
||||
{
|
||||
@ -363,6 +368,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -378,6 +384,10 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(index);
|
||||
@ -532,6 +542,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
@ -543,6 +554,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
|
||||
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
|
||||
{
|
||||
@ -813,6 +828,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
@ -824,6 +840,10 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
|
||||
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
@ -1002,11 +1022,16 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device),
|
||||
m_device(device),
|
||||
@ -1179,12 +1204,17 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
@ -99,12 +99,17 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = true,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
|
||||
{
|
||||
|
@ -97,12 +97,17 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
|
@ -585,6 +585,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
|
||||
IsAligned = false,
|
||||
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -598,6 +599,10 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
|
||||
InputTensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
|
||||
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
|
||||
static const bool RunningFullReduction = (NumOutputDims==0);
|
||||
|
@ -142,12 +142,17 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = PlainObjectType::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
|
||||
}
|
||||
|
||||
@ -374,12 +379,17 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorRef<Derived>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
|
||||
: m_ref(m)
|
||||
{ }
|
||||
@ -401,7 +411,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
|
||||
|
||||
|
||||
protected:
|
||||
TensorRef<Derived> m_ref;
|
||||
};
|
||||
@ -423,10 +433,15 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
|
||||
{ }
|
||||
|
||||
|
@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -128,6 +129,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
|
||||
OutputTensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: m_impl(op.expression(), device),
|
||||
@ -400,6 +405,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -413,6 +419,10 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dimensions& dimensions() const { return this->m_dimensions; }
|
||||
|
@ -100,12 +100,17 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: m_impl(op.expression(), device),
|
||||
|
@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
@ -129,6 +130,10 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: m_device(device),
|
||||
@ -426,6 +431,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
@ -438,6 +444,10 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
@ -115,12 +115,17 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
|
@ -98,12 +98,17 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_traceDim(1), m_device(device)
|
||||
{
|
||||
|
@ -184,12 +184,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
|
||||
m_impl(op.expression(), device)
|
||||
{
|
||||
|
@ -119,6 +119,7 @@ ei_add_test(cxx11_tensor_dimension)
|
||||
ei_add_test(cxx11_tensor_map)
|
||||
ei_add_test(cxx11_tensor_assign)
|
||||
ei_add_test(cxx11_tensor_block_access)
|
||||
ei_add_test(cxx11_tensor_broadcasting)
|
||||
ei_add_test(cxx11_tensor_comparisons)
|
||||
ei_add_test(cxx11_tensor_forced_eval)
|
||||
ei_add_test(cxx11_tensor_math)
|
||||
@ -188,8 +189,9 @@ if(EIGEN_TEST_CXX11)
|
||||
ei_add_test(cxx11_tensor_of_complex)
|
||||
ei_add_test(cxx11_tensor_of_strings)
|
||||
ei_add_test(cxx11_tensor_lvalue)
|
||||
ei_add_test(cxx11_tensor_broadcasting)
|
||||
ei_add_test(cxx11_tensor_chipping)
|
||||
ei_add_test(cxx11_tensor_block_eval)
|
||||
ei_add_test(cxx11_tensor_block_io)
|
||||
ei_add_test(cxx11_tensor_concatenation)
|
||||
ei_add_test(cxx11_tensor_inflation)
|
||||
ei_add_test(cxx11_tensor_morphing)
|
||||
|
@ -91,7 +91,16 @@ static void test_vectorized_broadcasting()
|
||||
}
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
tensor.resize(11,3,5);
|
||||
#else
|
||||
array<Index, 3> new_dims;
|
||||
new_dims[0] = 11;
|
||||
new_dims[1] = 3;
|
||||
new_dims[2] = 5;
|
||||
tensor.resize(new_dims);
|
||||
#endif
|
||||
|
||||
tensor.setRandom();
|
||||
broadcast = tensor.broadcast(broadcasts);
|
||||
|
||||
@ -139,7 +148,16 @@ static void test_static_broadcasting()
|
||||
}
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
tensor.resize(11,3,5);
|
||||
#else
|
||||
array<Index, 3> new_dims;
|
||||
new_dims[0] = 11;
|
||||
new_dims[1] = 3;
|
||||
new_dims[2] = 5;
|
||||
tensor.resize(new_dims);
|
||||
#endif
|
||||
|
||||
tensor.setRandom();
|
||||
broadcast = tensor.broadcast(broadcasts);
|
||||
|
||||
|
@ -16,6 +16,7 @@
using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
using Eigen::internal::TiledEvaluation;

// A set of tests to verify that different TensorExecutor strategies yields the
// same results for all the ops, supporting tiled evaluation.
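The CALL_SUBTEST_COMBINATIONS_V2 macro near the end of this file expands to calls of the following shape; written out by hand for one configuration (float, rank 3, default device, tiling V2), purely as an illustration:

Eigen::DefaultDevice default_device;
test_execute_binary_expr<float, 3, Eigen::DefaultDevice,
                         /*Vectorizable=*/true,
                         Eigen::internal::TiledEvaluation::On, Eigen::ColMajor>(
    default_device);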
|
||||
@ -30,7 +31,7 @@ static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_unary_expr(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -47,7 +48,7 @@ static void test_execute_unary_expr(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -58,7 +59,7 @@ static void test_execute_unary_expr(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_binary_expr(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -78,7 +79,7 @@ static void test_execute_binary_expr(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -89,7 +90,7 @@ static void test_execute_binary_expr(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_broadcasting(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -111,7 +112,7 @@ static void test_execute_broadcasting(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -121,7 +122,7 @@ static void test_execute_broadcasting(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_chipping_rvalue(Device d)
|
||||
{
|
||||
auto dims = RandomDims<NumDims>(1, 10);
|
||||
@ -140,7 +141,7 @@ static void test_execute_chipping_rvalue(Device d)
|
||||
\
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
|
||||
using Executor = internal::TensorExecutor<const Assign, Device, \
|
||||
Vectorizable, Tileable>; \
|
||||
Vectorizable, Tiling>; \
|
||||
\
|
||||
Executor::run(Assign(dst, expr), d); \
|
||||
\
|
||||
@ -160,7 +161,7 @@ static void test_execute_chipping_rvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_chipping_lvalue(Device d)
|
||||
{
|
||||
auto dims = RandomDims<NumDims>(1, 10);
|
||||
@ -193,7 +194,7 @@ static void test_execute_chipping_lvalue(Device d)
|
||||
\
|
||||
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
|
||||
using Executor = internal::TensorExecutor<const Assign, Device, \
|
||||
Vectorizable, Tileable>; \
|
||||
Vectorizable, Tiling>; \
|
||||
\
|
||||
Executor::run(Assign(expr, src), d); \
|
||||
\
|
||||
@ -213,7 +214,7 @@ static void test_execute_chipping_lvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_shuffle_rvalue(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -239,7 +240,7 @@ static void test_execute_shuffle_rvalue(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -249,7 +250,7 @@ static void test_execute_shuffle_rvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_shuffle_lvalue(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -278,7 +279,7 @@ static void test_execute_shuffle_lvalue(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(expr, src), d);
|
||||
|
||||
@ -288,7 +289,7 @@ static void test_execute_shuffle_lvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_reduction(Device d)
|
||||
{
|
||||
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
||||
@ -320,7 +321,7 @@ static void test_execute_reduction(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -330,7 +331,7 @@ static void test_execute_reduction(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_reshape(Device d)
|
||||
{
|
||||
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
||||
@ -360,7 +361,7 @@ static void test_execute_reshape(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -370,7 +371,7 @@ static void test_execute_reshape(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_slice_rvalue(Device d)
|
||||
{
|
||||
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
||||
@ -400,7 +401,7 @@ static void test_execute_slice_rvalue(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -410,7 +411,7 @@ static void test_execute_slice_rvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_slice_lvalue(Device d)
|
||||
{
|
||||
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
||||
@ -443,7 +444,7 @@ static void test_execute_slice_lvalue(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(expr, slice), d);
|
||||
|
||||
@ -453,7 +454,7 @@ static void test_execute_slice_lvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_broadcasting_of_forced_eval(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -475,7 +476,7 @@ static void test_execute_broadcasting_of_forced_eval(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -497,7 +498,7 @@ struct DummyGenerator {
|
||||
};
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_generator_op(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -518,7 +519,7 @@ static void test_execute_generator_op(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -528,7 +529,7 @@ static void test_execute_generator_op(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_execute_reverse_rvalue(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -553,7 +554,7 @@ static void test_execute_reverse_rvalue(Device d)
|
||||
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using Executor =
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
||||
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
|
||||
|
||||
Executor::run(Assign(dst, expr), d);
|
||||
|
||||
@ -563,7 +564,7 @@ static void test_execute_reverse_rvalue(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_async_execute_unary_expr(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -584,7 +585,7 @@ static void test_async_execute_unary_expr(Device d)
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using DoneCallback = decltype(on_done);
|
||||
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
|
||||
Vectorizable, Tileable>;
|
||||
Vectorizable, Tiling>;
|
||||
|
||||
Executor::runAsync(Assign(dst, expr), d, on_done);
|
||||
done.Wait();
|
||||
@ -596,7 +597,7 @@ static void test_async_execute_unary_expr(Device d)
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
||||
bool Tileable, int Layout>
|
||||
TiledEvaluation Tiling, int Layout>
|
||||
static void test_async_execute_binary_expr(Device d)
|
||||
{
|
||||
static constexpr int Options = 0 | Layout;
|
||||
@ -620,7 +621,7 @@ static void test_async_execute_binary_expr(Device d)
|
||||
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
||||
using DoneCallback = decltype(on_done);
|
||||
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
|
||||
Vectorizable, Tileable>;
|
||||
Vectorizable, Tiling>;
|
||||
|
||||
Executor::runAsync(Assign(dst, expr), d, on_done);
|
||||
done.Wait();
|
||||
@ -640,34 +641,57 @@ static void test_async_execute_binary_expr(Device d)
|
||||
#define CALL_SUBTEST_PART(PART) \
|
||||
CALL_SUBTEST_##PART
|
||||
|
||||
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
|
||||
#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))

// NOTE: Tiling V2 is currently implemented only for a limited set of expression types, and only with the default device.
#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))

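Each use of a combination macro expands, through CALL_SUBTEST_PART, into one CALL_SUBTEST_<PART> call per parameter set. A minimal hand-expanded sketch of a single entry produced by CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3), assuming the CALL_SUBTEST_1 harness macro from Eigen's test main.h:

// One of the expansions generated by the V2 macro above: the test body is
// instantiated for an explicit device, vectorization flag, tiling strategy
// and data layout, then run against the default device.
CALL_SUBTEST_1((test_execute_unary_expr<float, 3, DefaultDevice,
                                        false, TiledEvaluation::On,
                                        ColMajor>(default_device)));
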
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))

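The async subtests drive internal::TensorAsyncExecutor through the runAsync/done.Wait() pattern shown at the top of this hunk. A minimal sketch of that completion handshake, assuming Eigen::Barrier from the CXX11 ThreadPool module:

// Completion handshake around Executor::runAsync (sketch): the barrier expects
// one notification, the done-callback delivers it once the executor finishes
// evaluating the expression, and Wait() blocks the test until then.
Eigen::Barrier done(1);
auto on_done = [&done]() { done.Notify(); };
// ... Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
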
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;
@@ -678,69 +702,69 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);

  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 2);
  CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 3);
  CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 4);
  CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);

  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 5);

  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 5);

  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
@@ -754,4 +778,3 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
}

#undef CALL_SUBTEST_COMBINATIONS