Remove legacy block evaluation support

parent 71aa53dd6d
commit 13c3327f5c
@@ -88,7 +88,6 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -230,7 +229,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,

@@ -108,7 +108,6 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
  enum {
    IsAligned = false,
    PacketAccess = false,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,

@@ -108,8 +108,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
                TensorEvaluator<RightArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
                   TensorEvaluator<RightArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
                  TensorEvaluator<RightArgType, Device>::BlockAccess,
    BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
                    TensorEvaluator<RightArgType, Device>::BlockAccessV2,
    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
@@ -216,19 +214,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
    m_rightImpl.getResourceRequirements(resources);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
        m_leftImpl.data() != NULL) {
      TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
                             block->tensor_strides(), block->tensor_strides(),
                             m_leftImpl.data() + block->first_coeff_index());
      m_rightImpl.block(&left_block);
    } else {
      m_rightImpl.block(block);
      m_leftImpl.writeBlock(*block);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
@@ -142,782 +142,6 @@ class TensorBlock {
  Scalar* m_data;  // Not owned.
};

template <typename Scalar, typename StorageIndex>
struct TensorBlockCopyOp {

  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
      const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const StorageIndex src_index, const StorageIndex src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_index];
    Scalar* dst = &dst_data[dst_index];

    if (!Vectorizable) {
      for (Index i = 0; i < num_coeff_to_copy; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    if (src_stride == 1) {
      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
      if (dst_stride == 1) {
        // LINEAR
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = ploadu<Packet>(src + i);
          pstoreu<Scalar, Packet>(dst + i, p);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i] = src[i];
        }
      } else {
        // SCATTER
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = ploadu<Packet>(src + i);
          pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i * dst_stride] = src[i];
        }
      }
    } else if (src_stride == 0) {
      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
      if (dst_stride == 1) {
        // LINEAR
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = pload1<Packet>(src);
          pstoreu<Scalar, Packet>(dst + i, p);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i] = *src;
        }
      } else {
        // SCATTER
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = pload1<Packet>(src);
          pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i * dst_stride] = *src;
        }
      }
    } else {
      if (dst_stride == 1) {
        // GATHER
        const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
          pstoreu<Scalar, Packet>(dst + i, p);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i] = src[i * src_stride];
        }
      } else {
        // RANDOM
        for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
          dst[i * dst_stride] = src[i * src_stride];
        }
      }
    }
  }
};
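For orientation only (not part of the patch): the removed kernel above is essentially a strided copy that picks one of four regimes from the source and destination strides. A minimal scalar-only sketch of that dispatch, without the ploadu/pscatter/pgather packet fast paths, would look like this; the function name and types are hypothetical.

// Hypothetical scalar analogue of TensorBlockCopyOp::Run: same four stride
// regimes (LINEAR, SCATTER, GATHER, RANDOM) plus the src_stride == 0 broadcast
// case, but without the packet intrinsics used by the removed code.
template <typename Scalar, typename Index>
void strided_copy(Index n, Index dst_index, Index dst_stride, Scalar* dst_data,
                  Index src_index, Index src_stride, const Scalar* src_data) {
  const Scalar* src = &src_data[src_index];
  Scalar* dst = &dst_data[dst_index];
  if (src_stride == 1 && dst_stride == 1) {   // LINEAR: contiguous copy
    for (Index i = 0; i < n; ++i) dst[i] = src[i];
  } else if (src_stride == 1) {               // SCATTER: strided writes
    for (Index i = 0; i < n; ++i) dst[i * dst_stride] = src[i];
  } else if (src_stride == 0) {               // broadcast a single value
    for (Index i = 0; i < n; ++i) dst[i * dst_stride] = *src;
  } else if (dst_stride == 1) {               // GATHER: strided reads
    for (Index i = 0; i < n; ++i) dst[i] = src[i * src_stride];
  } else {                                    // RANDOM: both sides strided
    for (Index i = 0; i < n; ++i) dst[i * dst_stride] = src[i * src_stride];
  }
}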
/**
 * \class TensorBlockIO
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block IO class.
 *
 * This class is responsible for copying data between a tensor and a tensor
 * block.
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
          bool BlockRead>
class TensorBlockIO {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef TensorBlockCopyOp<Scalar, StorageIndex> BlockCopyOp;

 protected:
  typedef array<StorageIndex, NumDims> Dimensions;

  struct BlockIteratorState {
    StorageIndex input_stride;
    StorageIndex output_stride;
    StorageIndex input_span;
    StorageIndex output_span;
    StorageIndex size;
    StorageIndex count;
    BlockIteratorState()
        : input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0),
          size(0),
          count(0) {}
  };
  // Compute how many inner dimensions can be squeezed when doing IO
  // between a tensor and a block. It is safe to squeeze inner dimensions
  // only if they are not reordered.
  static int NumSqueezableInnerDims(const Dimensions& tensor_to_block_dim_map) {
    int num_squeezable_dims = 0;
    if (Layout == ColMajor) {
      for (int i = 0; i < NumDims; ++i) {
        if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
        else break;
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
        else break;
      }
    }
    return num_squeezable_dims;
  }
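A small self-contained illustration of the rule just stated (assumed numbers, not from the patch): with a column-major dimension map {0, 1, 3, 2}, the first two tensor dimensions map to themselves and the third does not, so only the two innermost dimensions may be merged.

// Hypothetical stand-alone version of the ColMajor scan above.
int squeezable_inner_dims_example() {
  const int map[4] = {0, 1, 3, 2};  // tensor-to-block dimension map
  int n = 0;
  for (int i = 0; i < 4; ++i) {
    if (map[i] == i) ++n;   // identity-mapped inner dim: safe to squeeze
    else break;             // first reordered dim stops the scan
  }
  return n;  // returns 2 for this map
}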
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
      const Block& block, StorageIndex first_coeff_index,
      const Dimensions& tensor_to_block_dim_map,
      const Dimensions& tensor_strides,
      const Scalar* src_data,
      Scalar* dst_data) {
    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(tensor_to_block_dim_map);

    // Find the innermost tensor dimension whose size is not 1. This is the
    // effective inner dim. If all dimensions are of size 1, then fallback to
    // using the actual innermost dim to avoid out-of-bound access.
    StorageIndex num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
        num_size_one_inner_dims = i;
        break;
      }
    }

    // Calculate strides and dimensions.
    const StorageIndex tensor_stride1_dim = cond<Layout>()(
        num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
    const StorageIndex block_dim_for_tensor_stride1_dim =
        NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
    StorageIndex block_inner_dim_size =
        NumDims == 0 ? 1
                     : block.block_sizes()[block_dim_for_tensor_stride1_dim];

    // Squeeze multiple inner dims into one for larger inner dim size.
    for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const Index dim = cond<Layout>()(i, NumDims - i - 1);
      const StorageIndex block_stride =
          block.block_strides()[tensor_to_block_dim_map[dim]];
      if (block_inner_dim_size == block_stride &&
          block_stride == tensor_strides[dim]) {
        block_inner_dim_size *=
            block.block_sizes()[tensor_to_block_dim_map[dim]];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    StorageIndex inputIndex;
    StorageIndex outputIndex;
    StorageIndex input_stride;
    StorageIndex output_stride;

    // Setup strides to read/write along the tensor's stride1 dimension.
    if (BlockRead) {
      inputIndex = first_coeff_index;
      outputIndex = 0;
      input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
      output_stride =
          NumDims == 0
              ? 1
              : block.block_strides()[block_dim_for_tensor_stride1_dim];
    } else {
      inputIndex = 0;
      outputIndex = first_coeff_index;
      input_stride =
          NumDims == 0
              ? 1
              : block.block_strides()[block_dim_for_tensor_stride1_dim];
      output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
    }

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> block_iter_state;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    Index num_squeezed_dims = 0;
    for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const Index dim = cond<Layout>()(i + 1, NumDims - i - 2);
      const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
      if (size == 1) {
        continue;
      }
      block_iter_state[num_squeezed_dims].size = size;
      if (BlockRead) {
        block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
        block_iter_state[num_squeezed_dims].output_stride =
            block.block_strides()[tensor_to_block_dim_map[dim]];
      } else {
        block_iter_state[num_squeezed_dims].input_stride =
            block.block_strides()[tensor_to_block_dim_map[dim]];
        block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
      }
      block_iter_state[num_squeezed_dims].input_span =
          block_iter_state[num_squeezed_dims].input_stride *
          (block_iter_state[num_squeezed_dims].size - 1);
      block_iter_state[num_squeezed_dims].output_span =
          block_iter_state[num_squeezed_dims].output_stride *
          (block_iter_state[num_squeezed_dims].size - 1);
      ++num_squeezed_dims;
    }

    // Iterate copying data from src to dst.
    const StorageIndex block_total_size =
        NumDims == 0 ? 1 : block.block_sizes().TotalSize();
    for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
      BlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
                       dst_data, inputIndex, input_stride, src_data);
      // Update index.
      for (int j = 0; j < num_squeezed_dims; ++j) {
        if (++block_iter_state[j].count < block_iter_state[j].size) {
          inputIndex += block_iter_state[j].input_stride;
          outputIndex += block_iter_state[j].output_stride;
          break;
        }
        block_iter_state[j].count = 0;
        inputIndex -= block_iter_state[j].input_span;
        outputIndex -= block_iter_state[j].output_span;
      }
    }
  }
};

/**
 * \class TensorBlockReader
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block reader class.
 *
 * This class is responsible for reading a tensor block.
 *
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
                                               Layout, /*BlockRead=*/true> {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true> Base;

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      Block* block, const Scalar* src_data) {
    array<StorageIndex, NumDims> tensor_to_block_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      tensor_to_block_dim_map[i] = i;
    }
    Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
               block->tensor_strides(), src_data, block->data());
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      Block* block, StorageIndex first_coeff_index,
      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
      const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
    Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
               tensor_strides, src_data, block->data());
  }
};

/**
 * \class TensorBlockWriter
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block writer class.
 *
 * This class is responsible for writing a tensor block.
 *
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
                                               Layout, /*BlockRead=*/false> {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false> Base;

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Block& block, Scalar* dst_data) {
    array<StorageIndex, NumDims> tensor_to_block_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      tensor_to_block_dim_map[i] = i;
    }
    Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
               block.tensor_strides(), block.data(), dst_data);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Block& block, StorageIndex first_coeff_index,
      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
      const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
    Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
               tensor_strides, block.data(), dst_data);
  }
};
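A hedged usage sketch (not from the patch) of how this reader/writer pair was typically driven, based only on the signatures and the TensorBlock constructor visible in this diff. These are internal classes that this commit deletes, so the sketch only documents the old API; `roundtrip`, `shape`, `src`, `dst`, and `block_storage` are illustrative names.

// Copy a block out of a source tensor buffer and write it back elsewhere,
// using the legacy (now removed) Eigen::internal block reader/writer.
using namespace Eigen;
typedef internal::TensorBlock<float, Eigen::Index, 3, ColMajor> Block;
typedef internal::TensorBlockReader<float, Eigen::Index, 3, ColMajor> Reader;
typedef internal::TensorBlockWriter<float, Eigen::Index, 3, ColMajor> Writer;

void roundtrip(const Block& shape, const float* src, float* dst,
               float* block_storage) {
  // Materialize a block with the same geometry as `shape`, backed by scratch
  // memory, then fill it from the source tensor...
  Block block(shape.first_coeff_index(), shape.block_sizes(),
              shape.block_strides(), shape.tensor_strides(), block_storage);
  Reader::Run(&block, src);
  // ...and write the block back into the destination tensor buffer.
  Writer::Run(block, dst);
}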
/**
 * \class TensorBlockCwiseUnaryOp
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Carries out a cwise unary op on a number of coefficients.
 *
 * This class reads strided input from the argument, and writes the
 * result of the cwise unary op to the strided output array.
 *
 */
template <bool Vectorizable>
struct TensorBlockCwiseUnaryOp {
  template <typename StorageIndex, typename UnaryFunctor,
            typename OutputScalar, typename InputScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const UnaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex input_index,
      const StorageIndex input_stride, const InputScalar* input_data) {
    typedef const Array<InputScalar, Dynamic, 1> Input;
    typedef Array<OutputScalar, Dynamic, 1> Output;

    typedef Map<Input, 0, InnerStride<> > InputMap;
    typedef Map<Output, 0, InnerStride<> > OutputMap;

    const InputScalar* input_base = &input_data[input_index];
    OutputScalar* output_base = &output_data[output_index];

    const InputMap input(input_base, num_coeff, InnerStride<>(input_stride));
    OutputMap output(output_base, num_coeff, InnerStride<>(output_stride));

    output = CwiseUnaryOp<UnaryFunctor, InputMap>(input, functor);
  }
};

template <>
struct TensorBlockCwiseUnaryOp<true> {
  template <typename StorageIndex, typename UnaryFunctor,
            typename OutputScalar, typename InputScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const UnaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex input_index,
      const StorageIndex input_stride, const InputScalar* input_data) {
    if (input_stride == 1 && output_stride == 1) {
      typedef const Array<InputScalar, Dynamic, 1> Input;
      typedef Array<OutputScalar, Dynamic, 1> Output;

      const Map<Input> input(&input_data[input_index], num_coeff);
      Map<Output> output(&output_data[output_index], num_coeff);

      output = CwiseUnaryOp<UnaryFunctor, Map<Input> >(input, functor);
    } else {
      TensorBlockCwiseUnaryOp<false>::Run(
          functor, num_coeff, output_index, output_stride, output_data,
          input_index, input_stride, input_data);
    }
  }
};

/**
 * \class TensorBlockCwiseUnaryIO
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block IO class for carrying out cwise unary ops.
 *
 * This class carries out the unary op on given blocks.
 */
template <typename UnaryFunctor, typename StorageIndex, typename OutputScalar,
          int NumDims, int Layout>
struct TensorBlockCwiseUnaryIO {
  typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
                               Layout>::Dimensions Dimensions;

  typedef TensorBlockCwiseUnaryOp<
      packet_traits<OutputScalar>::Vectorizable &&
      functor_traits<UnaryFunctor>::PacketAccess>
      TensorBlockCwiseUnaryOpImpl;

  struct BlockIteratorState {
    StorageIndex output_stride, output_span;
    StorageIndex input_stride, input_span;
    StorageIndex size, count;
  };

  template <typename InputScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const UnaryFunctor& functor, const Dimensions& block_sizes,
      const Dimensions& block_strides, OutputScalar* output_data,
      const array<StorageIndex, NumDims>& input_strides,
      const InputScalar* input_data) {
    // Find the innermost dimension whose size is not 1. This is the effective
    // inner dim. If all dimensions are of size 1, fallback to using the actual
    // innermost dim to avoid out-of-bound access.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      if (block_sizes[dim] != 1) {
        num_size_one_inner_dims = i;
        break;
      }
    }
    // Calculate strides and dimensions.
    const int inner_dim =
        NumDims == 0 ? 1
                     : cond<Layout>()(num_size_one_inner_dims,
                                      NumDims - num_size_one_inner_dims - 1);
    StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
    for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      // Merge multiple inner dims into one for larger inner dim size (i.e.
      // fewer calls to TensorBlockCwiseUnaryOp::Run()).
      if (inner_dim_size == block_strides[dim] &&
          block_strides[dim] == input_strides[dim]) {
        inner_dim_size *= block_sizes[dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    StorageIndex output_index = 0, input_index = 0;

    const StorageIndex output_stride =
        NumDims == 0 ? 1 : block_strides[inner_dim];
    const StorageIndex input_stride =
        NumDims == 0 ? 1 : input_strides[inner_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> block_iter_state;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int num_squeezed_dims = 0;
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
      const StorageIndex size = block_sizes[dim];
      if (size == 1) {
        continue;
      }
      BlockIteratorState& state = block_iter_state[num_squeezed_dims];
      state.output_stride = block_strides[dim];
      state.input_stride = input_strides[dim];
      state.size = size;
      state.output_span = state.output_stride * (size - 1);
      state.input_span = state.input_stride * (size - 1);
      state.count = 0;
      ++num_squeezed_dims;
    }

    // Compute cwise unary op.
    const StorageIndex block_total_size =
        NumDims == 0 ? 1 : block_sizes.TotalSize();
    for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
      TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index,
                                       output_stride, output_data, input_index,
                                       input_stride, input_data);
      // Update index.
      for (int j = 0; j < num_squeezed_dims; ++j) {
        BlockIteratorState& state = block_iter_state[j];
        if (++state.count < state.size) {
          output_index += state.output_stride;
          input_index += state.input_stride;
          break;
        }
        state.count = 0;
        output_index -= state.output_span;
        input_index -= state.input_span;
      }
    }
  }
};

/**
 * \class TensorBlockCwiseBinaryOp
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Carries out a cwise binary op on a number of coefficients.
 *
 * This class reads strided inputs from left and right operands, and writes the
 * result of the cwise binary op to the strided output array.
 *
 */
template <bool Vectorizable>
struct TensorBlockCwiseBinaryOp {
  template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
            typename LeftScalar, typename RightScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const BinaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex left_index,
      const StorageIndex left_stride, const LeftScalar* left_data,
      const StorageIndex right_index, const StorageIndex right_stride,
      const RightScalar* right_data) {
    typedef const Array<LeftScalar, Dynamic, 1> Lhs;
    typedef const Array<RightScalar, Dynamic, 1> Rhs;
    typedef Array<OutputScalar, Dynamic, 1> Out;

    typedef Map<Lhs, 0, InnerStride<> > LhsMap;
    typedef Map<Rhs, 0, InnerStride<> > RhsMap;
    typedef Map<Out, 0, InnerStride<> > OutMap;

    const LeftScalar* lhs_base = &left_data[left_index];
    const RightScalar* rhs_base = &right_data[right_index];
    OutputScalar* out_base = &output_data[output_index];

    const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
    const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
    OutMap out(out_base, num_coeff, InnerStride<>(output_stride));

    out = CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
  }
};

template <>
struct TensorBlockCwiseBinaryOp<true> {
  template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
            typename LeftScalar, typename RightScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const BinaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex left_index,
      const StorageIndex left_stride, const LeftScalar* left_data,
      const StorageIndex right_index, const StorageIndex right_stride,
      const RightScalar* right_data) {
    if (left_stride == 1 && right_stride == 1 && output_stride == 1) {
      typedef const Array<LeftScalar, Dynamic, 1> Lhs;
      typedef const Array<RightScalar, Dynamic, 1> Rhs;
      typedef Array<OutputScalar, Dynamic, 1> Out;

      const LeftScalar* lhs_base = &left_data[left_index];
      const RightScalar* rhs_base = &right_data[right_index];
      OutputScalar* out_base = &output_data[output_index];

      const Map<Lhs> lhs(lhs_base, num_coeff);
      const Map<Rhs> rhs(rhs_base, num_coeff);
      Map<Out> out(out_base, num_coeff);

      out = CwiseBinaryOp<BinaryFunctor, Map<Lhs>, Map<Rhs> >(lhs, rhs, functor);
    } else {
      TensorBlockCwiseBinaryOp<false>::Run(
          functor, num_coeff, output_index, output_stride, output_data,
          left_index, left_stride, left_data, right_index, right_stride,
          right_data);
    }
  }
};
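The kernels above all rely on the same public-Eigen device: viewing strided raw memory through Map with an InnerStride so a cwise functor can be applied without an explicit loop. A standalone sketch of that idea, using only the public API (the function and its parameters are illustrative, not part of the patch):

// Apply a cwise binary expression to strided float data via Map + InnerStride.
#include <Eigen/Core>

void scaled_sum(const float* lhs, int lhs_stride,
                const float* rhs, int rhs_stride,
                float* out, int out_stride, int n) {
  using Eigen::Array;
  using Eigen::Dynamic;
  using Eigen::InnerStride;
  using Eigen::Map;
  typedef Map<const Array<float, Dynamic, 1>, 0, InnerStride<> > ConstMap;
  typedef Map<Array<float, Dynamic, 1>, 0, InnerStride<> > MutableMap;
  ConstMap a(lhs, n, InnerStride<>(lhs_stride));   // strided left operand
  ConstMap b(rhs, n, InnerStride<>(rhs_stride));   // strided right operand
  MutableMap o(out, n, InnerStride<>(out_stride)); // strided destination
  o = a + 2.0f * b;  // any cwise unary/binary expression works the same way
}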
/**
 * \class TensorBlockCwiseBinaryIO
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block IO class for carrying out cwise binary ops.
 *
 * This class carries out the binary op on given blocks.
 *
 */
template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
          int NumDims, int Layout>
struct TensorBlockCwiseBinaryIO {
  typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;

  typedef TensorBlockCwiseBinaryOp<
      packet_traits<OutputScalar>::Vectorizable &&
      functor_traits<BinaryFunctor>::PacketAccess>
      TensorBlockCwiseBinaryOpImpl;

  struct BlockIteratorState {
    StorageIndex output_stride, output_span;
    StorageIndex left_stride, left_span;
    StorageIndex right_stride, right_span;
    StorageIndex size, count;
  };

  template <typename LeftScalar, typename RightScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const BinaryFunctor& functor, const Dimensions& block_sizes,
      const Dimensions& block_strides, OutputScalar* output_data,
      const array<StorageIndex, NumDims>& left_strides,
      const LeftScalar* left_data,
      const array<StorageIndex, NumDims>& right_strides,
      const RightScalar* right_data) {
    // Find the innermost dimension whose size is not 1. This is the effective
    // inner dim. If all dimensions are of size 1, fallback to using the actual
    // innermost dim to avoid out-of-bound access.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      if (block_sizes[dim] != 1) {
        num_size_one_inner_dims = i;
        break;
      }
    }
    // Calculate strides and dimensions.
    const int inner_dim =
        NumDims == 0 ? 1
                     : cond<Layout>()(num_size_one_inner_dims,
                                      NumDims - num_size_one_inner_dims - 1);
    StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
    for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      // Merge multiple inner dims into one for larger inner dim size (i.e.
      // fewer calls to TensorBlockCwiseBinaryOp::Run()).
      if (inner_dim_size == block_strides[dim] &&
          block_strides[dim] == left_strides[dim] &&
          block_strides[dim] == right_strides[dim]) {
        inner_dim_size *= block_sizes[dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    StorageIndex output_index = 0, left_index = 0, right_index = 0;
    const StorageIndex output_stride =
        NumDims == 0 ? 1 : block_strides[inner_dim];
    const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
    const StorageIndex right_stride =
        NumDims == 0 ? 1 : right_strides[inner_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> block_iter_state;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int num_squeezed_dims = 0;
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
      const StorageIndex size = block_sizes[dim];
      if (size == 1) {
        continue;
      }
      BlockIteratorState& state = block_iter_state[num_squeezed_dims];
      state.output_stride = block_strides[dim];
      state.left_stride = left_strides[dim];
      state.right_stride = right_strides[dim];
      state.size = size;
      state.output_span = state.output_stride * (size - 1);
      state.left_span = state.left_stride * (size - 1);
      state.right_span = state.right_stride * (size - 1);
      state.count = 0;
      ++num_squeezed_dims;
    }

    // Compute cwise binary op.
    const StorageIndex block_total_size =
        NumDims == 0 ? 1 : block_sizes.TotalSize();
    for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
      TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index,
                                        output_stride, output_data, left_index,
                                        left_stride, left_data, right_index,
                                        right_stride, right_data);
      // Update index.
      for (int j = 0; j < num_squeezed_dims; ++j) {
        BlockIteratorState& state = block_iter_state[j];
        if (++state.count < state.size) {
          output_index += state.output_stride;
          left_index += state.left_stride;
          right_index += state.right_stride;
          break;
        }
        state.count = 0;
        output_index -= state.output_span;
        left_index -= state.left_span;
        right_index -= state.right_span;
      }
    }
  }
};

/**
 * \class TensorBlockView
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Read-only view into a block of data.
 *
 * This class provides read-only access to a block of data in impl. It may need
 * to allocate space for holding the intermediate result.
 *
 */
template <class ArgType, class Device>
struct TensorBlockView {
  typedef TensorEvaluator<ArgType, Device> Impl;
  typedef typename Impl::Index StorageIndex;
  typedef typename remove_const<typename Impl::Scalar>::type Scalar;
  static const int NumDims = array_size<typename Impl::Dimensions>::value;
  typedef DSizes<StorageIndex, NumDims> Dimensions;

  // Constructs a TensorBlockView for `impl`. `block` is only used for
  // specifying the start offset, shape, and strides of the block.
  template <typename OtherTensorBlock>
  TensorBlockView(const Device& device,
                  const TensorEvaluator<ArgType, Device>& impl,
                  const OtherTensorBlock& block)
      : m_device(device),
        m_block_sizes(block.block_sizes()),
        m_data(NULL),
        m_allocated_data(NULL) {
    if (Impl::RawAccess && impl.data() != NULL) {
      m_data = impl.data() + block.first_coeff_index();
      m_block_strides = block.tensor_strides();
    } else {
      // Actually make a copy.

      // TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
      // Consider allowing ops to request additional temporary block memory in
      // TensorOpResourceRequirements.
      m_allocated_data = static_cast<Scalar*>(
          m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
      m_data = m_allocated_data;
      if (NumDims > 0) {
        if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
          m_block_strides[0] = 1;
          for (int i = 1; i < NumDims; ++i) {
            m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
          }
        } else {
          m_block_strides[NumDims - 1] = 1;
          for (int i = NumDims - 2; i >= 0; --i) {
            m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
          }
        }
      }
      TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
          block.first_coeff_index(), m_block_sizes, m_block_strides,
          block.tensor_strides(), m_allocated_data);
      impl.block(&input_block);
    }
  }

  ~TensorBlockView() {
    if (m_allocated_data != NULL) {
      m_device.deallocate(m_allocated_data);
    }
  }

  const Dimensions& block_sizes() const { return m_block_sizes; }
  const Dimensions& block_strides() const { return m_block_strides; }
  const Scalar* data() const { return m_data; }

 private:
  const Device EIGEN_DEVICE_REF m_device;
  Dimensions m_block_sizes, m_block_strides;
  const Scalar* m_data;      // Not owned.
  Scalar* m_allocated_data;  // Owned.
};

/**
 * \class TensorBlockMapper
 * \ingroup CXX11_Tensor_Module
@@ -1108,137 +332,6 @@ class TensorBlockMapper {
  StorageIndex m_total_block_count;
};

/**
 * \class TensorSliceBlockMapper
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor slice block mapper class.
 *
 * This class is responsible for iterating over the blocks of
 * a slice of a tensor. Supports shuffling of the block strides
 * for callers that want to reduce strides for dimensions to be
 * processed together.
 *
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorSliceBlockMapper {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef DSizes<StorageIndex, NumDims> Dimensions;

  TensorSliceBlockMapper(const Dimensions& tensor_dims,
                         const Dimensions& tensor_slice_offsets,
                         const Dimensions& tensor_slice_extents,
                         const Dimensions& block_dim_sizes,
                         const Dimensions& block_stride_order)
      : m_tensor_dimensions(tensor_dims),
        m_tensor_slice_offsets(tensor_slice_offsets),
        m_tensor_slice_extents(tensor_slice_extents),
        m_block_dim_sizes(block_dim_sizes),
        m_block_stride_order(block_stride_order),
        m_total_block_count(1) {
    // Calculate block counts by dimension and total block count.
    DSizes<StorageIndex, NumDims> block_count;
    for (Index i = 0; i < block_count.rank(); ++i) {
      block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
    }
    m_total_block_count = array_prod(block_count);

    // Calculate block strides (used for enumerating blocks).
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_block_strides[0] = 1;
      m_tensor_strides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
        m_tensor_strides[i] =
            m_tensor_strides[i - 1] * m_tensor_dimensions[i - 1];
      }
    } else {
      m_block_strides[NumDims - 1] = 1;
      m_tensor_strides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
        m_tensor_strides[i] =
            m_tensor_strides[i + 1] * m_tensor_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
  GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
    StorageIndex first_coeff_index = 0;
    DSizes<StorageIndex, NumDims> coords;
    DSizes<StorageIndex, NumDims> sizes;
    DSizes<StorageIndex, NumDims> strides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = block_index / m_block_strides[i];
        coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
        sizes[i] = numext::mini(
            m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
            m_block_dim_sizes[i]);
        block_index -= idx * m_block_strides[i];
        first_coeff_index += coords[i] * m_tensor_strides[i];
      }
      coords[0] =
          m_tensor_slice_offsets[0] + block_index * m_block_dim_sizes[0];
      sizes[0] = numext::mini(
          m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0],
          m_block_dim_sizes[0]);
      first_coeff_index += coords[0] * m_tensor_strides[0];

      StorageIndex prev_dim = m_block_stride_order[0];
      strides[prev_dim] = 1;
      for (int i = 1; i < NumDims; ++i) {
        const StorageIndex curr_dim = m_block_stride_order[i];
        strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
        prev_dim = curr_dim;
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const StorageIndex idx = block_index / m_block_strides[i];
        coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
        sizes[i] = numext::mini(
            m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
            m_block_dim_sizes[i]);
        block_index -= idx * m_block_strides[i];
        first_coeff_index += coords[i] * m_tensor_strides[i];
      }
      coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] +
                            block_index * m_block_dim_sizes[NumDims - 1];
      sizes[NumDims - 1] = numext::mini(
          m_tensor_slice_offsets[NumDims - 1] +
              m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1],
          m_block_dim_sizes[NumDims - 1]);
      first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];

      StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
      strides[prev_dim] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        const StorageIndex curr_dim = m_block_stride_order[i];
        strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
        prev_dim = curr_dim;
      }
    }

    return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
    return m_total_block_count;
  }

 private:
  Dimensions m_tensor_dimensions;
  Dimensions m_tensor_slice_offsets;
  Dimensions m_tensor_slice_extents;
  Dimensions m_tensor_strides;
  Dimensions m_block_dim_sizes;
  Dimensions m_block_stride_order;
  Dimensions m_block_strides;
  StorageIndex m_total_block_count;
};

}  // namespace internal

}  // namespace Eigen
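The slice mapper's constructor enumerates blocks with a divup-based count per dimension and the product over dimensions. A worked example with assumed numbers (not taken from the patch):

// A 70x50 slice tiled with 32x32 blocks: 3 blocks along dim 0, 2 along dim 1.
#include <cassert>

long divup(long x, long y) { return (x + y - 1) / y; }

void block_count_example() {
  const long extents[2] = {70, 50};     // tensor_slice_extents
  const long block_dims[2] = {32, 32};  // block_dim_sizes
  long total = 1;
  for (int i = 0; i < 2; ++i) total *= divup(extents[i], block_dims[i]);
  assert(total == 6);  // 3 * 2 blocks, matching m_total_block_count
}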
@@ -114,7 +114,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  enum {
    IsAligned = true,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -123,21 +122,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  // Block based access to the XprType (input) tensor.
  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
      TensorBlock;
  typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
      TensorBlockReader;

  // We do block based broadcasting using a trick with 2x tensor rank and 0
  // strides. See block method implementation for details.
  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;

  typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
      BroadcastTensorBlock;
  typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
      BroadcastTensorBlockReader;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -641,246 +629,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    m_impl.getResourceRequirements(resources);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
      TensorBlock* output_block) const {
    if (NumDims <= 0) {
      output_block->data()[0] = m_impl.coeff(0);
      return;
    }

    // Because we only support kSkewedInnerDims blocking, block size should be
    // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size
    // for the first outer dim, and 1 for other outer dims. This is guaranteed
    // by MergeResourceRequirements() in TensorBlock.h.
    const Dimensions& output_block_sizes = output_block->block_sizes();
    const Dimensions& output_block_strides = output_block->block_strides();

    // Find where outer dims start.
    int outer_dim_start = 0;
    Index outer_dim_size = 1, inner_dim_size = 1;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                          ? i
                          : NumDims - i - 1;
      if (i > outer_dim_start) {
        eigen_assert(output_block_sizes[dim] == 1);
      } else if (output_block_sizes[dim] != m_dimensions[dim]) {
        eigen_assert(output_block_sizes[dim] < m_dimensions[dim]);
        outer_dim_size = output_block_sizes[dim];
      } else {
        inner_dim_size *= output_block_sizes[dim];
        ++outer_dim_start;
      }
    }

    if (inner_dim_size == 0 || outer_dim_size == 0) {
      return;
    }

    const Dimensions& input_dims = Dimensions(m_impl.dimensions());

    // Pre-fill input_block_sizes, broadcast_block_sizes,
    // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
    // only modify the outer_dim_start-th dimension on these arrays.

    // Calculate the input block size for looking into the input.
    Dimensions input_block_sizes;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < outer_dim_start; ++i) {
        input_block_sizes[i] = input_dims[i];
      }
      for (int i = outer_dim_start; i < NumDims; ++i) {
        input_block_sizes[i] = 1;
      }
    } else {
      for (int i = 0; i < outer_dim_start; ++i) {
        input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1];
      }
      for (int i = outer_dim_start; i < NumDims; ++i) {
        input_block_sizes[NumDims - i - 1] = 1;
      }
    }

    // Broadcast with the 0-stride trick: Create 1 extra dim for each
    // broadcast, set the input stride to 0.
    //
    // When ColMajor:
    // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...].
    //
    // - broadcast_block_strides is [output_block_strides[0],
    //                               output_block_strides[0] * d_0,
    //                               output_block_strides[1],
    //                               output_block_strides[1] * d_1,
    //                               ...].
    //
    // - broadcast_tensor_strides is [output_block_strides[0],
    //                                0,
    //                                output_block_strides[1],
    //                                0,
    //                                ...].
    BroadcastDimensions broadcast_block_sizes, broadcast_block_strides,
        broadcast_tensor_strides;

    for (int i = 0; i < outer_dim_start; ++i) {
      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                          ? i
                          : NumDims - i - 1;
      const int copy_dim =
          static_cast<int>(Layout) == static_cast<int>(ColMajor)
              ? 2 * i
              : 2 * NumDims - 2 * i - 1;
      const int broadcast_dim =
          static_cast<int>(Layout) == static_cast<int>(ColMajor) ? copy_dim + 1
                                                                 : copy_dim - 1;
      broadcast_block_sizes[copy_dim] = input_dims[dim];
      broadcast_block_sizes[broadcast_dim] = m_broadcast[dim];
      broadcast_block_strides[copy_dim] = output_block_strides[dim];
      broadcast_block_strides[broadcast_dim] =
          output_block_strides[dim] * input_dims[dim];
      broadcast_tensor_strides[copy_dim] = m_inputStrides[dim];
      broadcast_tensor_strides[broadcast_dim] = 0;
    }
    for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                          ? i
                          : 2 * NumDims - i - 1;
      broadcast_block_sizes[dim] = 1;
      broadcast_block_strides[dim] = 0;
      broadcast_tensor_strides[dim] = 0;
    }

    const int outer_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? outer_dim_start
                              : NumDims - outer_dim_start - 1;

    if (outer_dim_size == 1) {
      // We just need one block read using the ready-set values above.
      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                     broadcast_block_strides, broadcast_tensor_strides, 0,
                     output_block);
    } else if (input_dims[outer_dim] == 1) {
      // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
      const int broadcast_outer_dim =
          static_cast<int>(Layout) == static_cast<int>(ColMajor)
              ? 2 * outer_dim_start + 1
              : 2 * NumDims - 2 * outer_dim_start - 2;
      broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
      broadcast_tensor_strides[broadcast_outer_dim] = 0;
      broadcast_block_strides[broadcast_outer_dim] =
          output_block_strides[outer_dim];
      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                     broadcast_block_strides, broadcast_tensor_strides, 0,
                     output_block);
    } else {
      // The general case. Let's denote the output block as x[...,
      // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
      // the outer_dim_start-th dimension (< NumDims). We need to split the
      // a:a+outer_dim_size into possibly 3 sub-blocks:
      //
      // (1) a:b, where b is the smallest multiple of
      //     input_dims[outer_dim_start] in [a, a+outer_dim_size].
      //
      // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
      //     in [a, a+outer_dim_size].
      //
      // (3) c:a+outer_dim_size .
      //
      // Or, when b and c do not exist, we just need to process the whole block
      // together.

      // Find a.
      const Index outer_dim_left_index =
          output_block->first_coeff_index() / m_outputStrides[outer_dim];

      // Find b and c.
      const Index input_outer_dim_size = input_dims[outer_dim];

      // First multiple after a. This is b when <= outer_dim_left_index +
      // outer_dim_size.
      const Index first_multiple =
          divup<Index>(outer_dim_left_index, input_outer_dim_size) *
          input_outer_dim_size;

      if (first_multiple <= outer_dim_left_index + outer_dim_size) {
        // b exists, so does c. Find it.
        const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
                                    input_outer_dim_size * input_outer_dim_size;
        const int copy_outer_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 2 * outer_dim_start
                : 2 * NumDims - 2 * outer_dim_start - 1;
        const int broadcast_outer_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 2 * outer_dim_start + 1
                : 2 * NumDims - 2 * outer_dim_start - 2;
        if (first_multiple > outer_dim_left_index) {
          const Index head_size = first_multiple - outer_dim_left_index;
          input_block_sizes[outer_dim] = head_size;
          broadcast_block_sizes[copy_outer_dim] = head_size;
          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
          broadcast_block_strides[copy_outer_dim] =
              output_block_strides[outer_dim];
          broadcast_block_sizes[broadcast_outer_dim] = 1;
          broadcast_tensor_strides[broadcast_outer_dim] = 0;
          broadcast_block_strides[broadcast_outer_dim] =
              output_block_strides[outer_dim] * input_dims[outer_dim];
          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                         broadcast_block_strides, broadcast_tensor_strides, 0,
                         output_block);
        }
        if (first_multiple < last_multiple) {
          input_block_sizes[outer_dim] = input_outer_dim_size;
          broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
          broadcast_block_strides[copy_outer_dim] =
              output_block_strides[outer_dim];
          broadcast_block_sizes[broadcast_outer_dim] =
              (last_multiple - first_multiple) / input_outer_dim_size;
          broadcast_tensor_strides[broadcast_outer_dim] = 0;
          broadcast_block_strides[broadcast_outer_dim] =
              output_block_strides[outer_dim] * input_dims[outer_dim];
          const Index offset = (first_multiple - outer_dim_left_index) *
                               m_outputStrides[outer_dim];
          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                         broadcast_block_strides, broadcast_tensor_strides,
                         offset, output_block);
        }
        if (last_multiple < outer_dim_left_index + outer_dim_size) {
          const Index tail_size =
              outer_dim_left_index + outer_dim_size - last_multiple;
          input_block_sizes[outer_dim] = tail_size;
          broadcast_block_sizes[copy_outer_dim] = tail_size;
          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
          broadcast_block_strides[copy_outer_dim] =
              output_block_strides[outer_dim];
          broadcast_block_sizes[broadcast_outer_dim] = 1;
          broadcast_tensor_strides[broadcast_outer_dim] = 0;
          broadcast_block_strides[broadcast_outer_dim] =
              output_block_strides[outer_dim] * input_dims[outer_dim];
          const Index offset = (last_multiple - outer_dim_left_index) *
                               m_outputStrides[outer_dim];
          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                         broadcast_block_strides, broadcast_tensor_strides,
                         offset, output_block);
        }
      } else {
        // b and c do not exist.
        const int copy_outer_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 2 * outer_dim_start
                : 2 * NumDims - 2 * outer_dim_start - 1;
        input_block_sizes[outer_dim] = outer_dim_size;
        broadcast_block_sizes[copy_outer_dim] = outer_dim_size;
        broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
        broadcast_block_strides[copy_outer_dim] =
            output_block_strides[outer_dim];
        BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                       broadcast_block_strides, broadcast_tensor_strides, 0,
                       output_block);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
          bool /*root_of_expr_ast*/ = false) const {
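A worked example with assumed sizes (for illustration only) of the 2x-rank / 0-stride layout described in the comment inside the removed block() method: ColMajor, input dims d = {2, 3}, broadcast factors b = {4, 1}, so the output is 8x3 with output block strides s = {1, 8}.

// Concrete instance of the [d_0, b_0, d_1, b_1] layout from the comment above.
void zero_stride_layout_example() {
  const int broadcast_block_sizes[4]    = {2, 4, 3, 1};   // [d_0, b_0, d_1, b_1]
  const int broadcast_block_strides[4]  = {1, 2, 8, 24};  // [s_0, s_0*d_0, s_1, s_1*d_1]
  const int broadcast_tensor_strides[4] = {1, 0, 2, 0};   // input strides; 0 on broadcast dims
  // Reading through these tensor strides revisits the same input column for
  // each of the b_0 = 4 broadcast copies: that is the 0-stride trick.
  (void)broadcast_block_sizes;
  (void)broadcast_block_strides;
  (void)broadcast_tensor_strides;
}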
@@ -1096,28 +844,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    return params;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
      const Dimensions& input_block_sizes,
      const BroadcastDimensions& broadcast_block_sizes,
      const BroadcastDimensions& broadcast_block_strides,
      const BroadcastDimensions& broadcast_tensor_strides, Index offset,
      TensorBlock* output_block) const {
    TensorBlock input_view_block(
        static_cast<int>(Layout) == static_cast<int>(ColMajor)
            ? indexColMajor(output_block->first_coeff_index() + offset)
            : indexRowMajor(output_block->first_coeff_index() + offset),
        input_block_sizes, Dimensions(m_inputStrides),
        Dimensions(m_inputStrides), NULL);

    internal::TensorBlockView<ArgType, Device> input_block(m_device, m_impl,
                                                           input_view_block);
    BroadcastTensorBlock broadcast_block(
        0, broadcast_block_sizes, broadcast_block_strides,
        broadcast_tensor_strides, output_block->data() + offset);

    BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
    DSizes<Index, NumDims> dimensions;
    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;

@@ -148,7 +148,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    IsAligned = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
    // Chipping of outer-most dimension is a trivial operation, because we can
    // read and write directly from the underlying tensor using single offset.
@@ -167,11 +166,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
      InputTensorBlock;
  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
      OutputTensorBlock;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -218,20 +212,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    }
    m_inputStride *= input_dims[m_dim.actualDim()];
    m_inputOffset = m_stride * op.offset();

    if (BlockAccess) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_inputStrides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
        }
      } else {
        m_inputStrides[NumInputDims - 1] = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
        }
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -323,52 +303,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    m_impl.getResourceRequirements(resources);
  }

  // TODO(andydavis) Reduce the overhead of this function (experiment with
  // using a fixed block size).
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
      OutputTensorBlock* output_block) const {
    // Calculate input block sizes.
    const DSizes<Index, NumDims>& output_block_sizes =
        output_block->block_sizes();
    const DSizes<Index, NumDims>& output_block_strides =
        output_block->block_strides();
    const Index chip_dim = m_dim.actualDim();
    DSizes<Index, NumInputDims> input_block_sizes;
    DSizes<Index, NumInputDims> input_block_strides;
    for (Index i = 0; i < NumInputDims; ++i) {
      if (i < chip_dim) {
        input_block_sizes[i] = output_block_sizes[i];
        input_block_strides[i] = output_block_strides[i];
      } else if (i > chip_dim) {
        input_block_sizes[i] = output_block_sizes[i - 1];
        input_block_strides[i] = output_block_strides[i - 1];
      } else {
        input_block_sizes[i] = 1;
      }
    }
    // Fix up input_block_stride for chip dimension.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (chip_dim == 0) {
        input_block_strides[chip_dim] = 1;
      } else {
        input_block_strides[chip_dim] =
            input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
      }
    } else {
      if (chip_dim == NumInputDims - 1) {
        input_block_strides[chip_dim] = 1;
      } else {
        input_block_strides[chip_dim] =
            input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
      }
    }
    // Instantiate and read input block from input tensor.
    InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
                                 input_block_sizes, input_block_strides,
                                 m_inputStrides, output_block->data());
    m_impl.block(&input_block);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
          bool root_of_expr_ast = false) const {
@ -482,7 +416,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
Index m_stride;
|
||||
Index m_inputOffset;
|
||||
Index m_inputStride;
|
||||
DSizes<Index, NumInputDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const internal::DimensionId<DimId> m_dim;
|
||||
const Device EIGEN_DEVICE_REF m_device;
|
||||
@ -508,7 +441,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
|
@ -125,7 +125,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
|
||||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
|
||||
@ -325,7 +324,6 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
|
||||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
|
||||
|
@ -381,7 +381,6 @@ struct TensorContractionEvaluatorBase
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
|
@ -302,7 +302,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
|
||||
#endif
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -309,7 +309,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, Device>::Layout,
|
||||
@ -787,7 +786,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
|
||||
|
@ -242,7 +242,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
|
||||
|
@ -95,7 +95,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<XprType, Device>::Layout,
|
||||
@ -269,7 +268,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
|
||||
|
@ -110,7 +110,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = true,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -120,9 +119,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
|
||||
static const int NumDims = internal::traits<ArgType>::NumDimensions;
|
||||
|
||||
typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout> TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout> TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -173,13 +169,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
m_impl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
|
||||
TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
|
||||
block->tensor_strides(), block->tensor_strides(),
|
||||
m_buffer + block->first_coeff_index());
|
||||
m_impl.block(&eval_to_block);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
|
||||
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
|
||||
// Add `m_buffer` as destination buffer to the block descriptor.
|
||||
@ -216,11 +205,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_buffer != NULL);
|
||||
TensorBlockReader::Run(block, m_buffer);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
// We assume that evalPacket or evalScalar is called to perform the
|
||||
// assignment and account for the cost of the write here.
|
||||
|
@ -45,7 +45,6 @@ struct TensorEvaluator
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
@ -55,13 +54,6 @@ struct TensorEvaluator
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
typedef typename internal::TensorBlockWriter<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -160,11 +152,6 @@ struct TensorEvaluator
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>*) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -172,12 +159,6 @@ struct TensorEvaluator
|
||||
return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
|
||||
const TensorBlock& block) {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockWriter::Run(block, m_data);
|
||||
}
|
||||
|
||||
template<typename TensorBlockV2>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
|
||||
const TensorBlockDesc& desc, const TensorBlockV2& block) {
|
||||
@ -263,7 +244,6 @@ struct TensorEvaluator<const Derived, Device>
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
@ -271,11 +251,6 @@ struct TensorEvaluator<const Derived, Device>
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -348,11 +323,6 @@ struct TensorEvaluator<const Derived, Device>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>*) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -404,7 +374,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
||||
&& (PacketType<CoeffReturnType, Device>::size >1)
|
||||
#endif
|
||||
,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -475,7 +444,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<UnaryOp>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -554,24 +522,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
m_argImpl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) {
|
||||
output_block->data()[0] = coeff(0);
|
||||
return;
|
||||
}
|
||||
internal::TensorBlockView<ArgType, Device> arg_block(m_device, m_argImpl,
|
||||
*output_block);
|
||||
internal::TensorBlockCwiseUnaryIO<UnaryOp, Index, ScalarNoConst, NumDims,
|
||||
Layout>::Run(m_functor,
|
||||
output_block->block_sizes(),
|
||||
output_block
|
||||
->block_strides(),
|
||||
output_block->data(),
|
||||
arg_block.block_strides(),
|
||||
arg_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -608,8 +558,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<BinaryOp>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
|
||||
@ -713,24 +661,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
m_rightImpl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) {
|
||||
output_block->data()[0] = coeff(Index(0));
|
||||
return;
|
||||
}
|
||||
internal::TensorBlockView<LeftArgType, Device> left_block(
|
||||
m_device, m_leftImpl, *output_block);
|
||||
internal::TensorBlockView<RightArgType, Device> right_block(
|
||||
m_device, m_rightImpl, *output_block);
|
||||
internal::TensorBlockCwiseBinaryIO<
|
||||
BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
|
||||
Layout>::Run(m_functor, output_block->block_sizes(),
|
||||
output_block->block_strides(), output_block->data(),
|
||||
left_block.block_strides(), left_block.data(),
|
||||
right_block.block_strides(), right_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -768,7 +698,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
||||
TensorEvaluator<Arg2Type, Device>::PacketAccess &&
|
||||
TensorEvaluator<Arg3Type, Device>::PacketAccess &&
|
||||
internal::functor_traits<TernaryOp>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
|
||||
TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
|
||||
@ -887,7 +816,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
||||
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
|
||||
TensorEvaluator<ElseArgType, Device>::PacketAccess &
|
||||
PacketType<Scalar, Device>::HasBlend,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<IfArgType, Device>::BlockAccessV2 &&
|
||||
TensorEvaluator<ThenArgType, Device>::BlockAccessV2 &&
|
||||
TensorEvaluator<ElseArgType, Device>::BlockAccessV2,
|
||||
|
@ -153,70 +153,6 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Process all the data with a single cpu thread, using blocks of data. By
|
||||
* sizing a block to fit L1 cache we get better cache performance.
|
||||
*/
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::Legacy> {
|
||||
public:
|
||||
typedef typename traits<Expression>::Scalar Scalar;
|
||||
typedef typename remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
|
||||
typedef typename traits<Expression>::Index StorageIndex;
|
||||
|
||||
static const int NumDims = traits<Expression>::NumDimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE void run(const Expression& expr,
|
||||
const DefaultDevice& device = DefaultDevice()) {
|
||||
typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
|
||||
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
|
||||
|
||||
Evaluator evaluator(expr, device);
|
||||
Index total_size = array_prod(evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
|
||||
if (total_size < cache_size
|
||||
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
|
||||
// TODO(andydavis) Reduce block management overhead for small tensors.
|
||||
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr,device);
|
||||
evaluator.cleanup();
|
||||
return;
|
||||
}
|
||||
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
// Size tensor blocks to fit in cache (or requested target block size).
|
||||
Index block_total_size = numext::mini(cache_size, total_size);
|
||||
TensorBlockShapeType block_shape = kSkewedInnerDims;
|
||||
// Query expression tree for desired block size/shape.
|
||||
std::vector<TensorOpResourceRequirements> resources;
|
||||
evaluator.getResourceRequirements(&resources);
|
||||
MergeResourceRequirements(resources, &block_shape, &block_total_size);
|
||||
|
||||
TensorBlockMapper block_mapper(
|
||||
TensorBlockDimensions(evaluator.dimensions()), block_shape,
|
||||
block_total_size);
|
||||
block_total_size = block_mapper.block_dims_total_size();
|
||||
|
||||
ScalarNoConst* data = static_cast<ScalarNoConst*>(
|
||||
device.allocate(block_total_size * sizeof(Scalar)));
|
||||
|
||||
const StorageIndex total_block_count = block_mapper.total_block_count();
|
||||
for (StorageIndex i = 0; i < total_block_count; ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
|
||||
evaluator.evalBlock(&block);
|
||||
}
|
||||
device.deallocate(data);
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Process all the data with a single cpu thread, using blocks of data. By
|
||||
* sizing a block to fit L1 cache we get better cache performance.
|
||||
@ -446,59 +382,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::Legacy> {
|
||||
public:
|
||||
typedef typename traits<Expression>::Index StorageIndex;
|
||||
typedef typename traits<Expression>::Scalar Scalar;
|
||||
typedef typename remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
static const int NumDims = traits<Expression>::NumDimensions;
|
||||
|
||||
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> BlockMapper;
|
||||
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
|
||||
|
||||
static EIGEN_STRONG_INLINE void run(const Expression& expr,
|
||||
const ThreadPoolDevice& device) {
|
||||
Evaluator evaluator(expr, device);
|
||||
Index total_size = array_prod(evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
|
||||
if (total_size < cache_size &&
|
||||
!ExpressionHasTensorBroadcastingOp<Expression>::value) {
|
||||
// TODO(andydavis) Reduce block management overhead for small tensors.
|
||||
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::Off>::run(expr,
|
||||
device);
|
||||
evaluator.cleanup();
|
||||
return;
|
||||
}
|
||||
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
|
||||
if (needs_assign) {
|
||||
const TilingContext tiling =
|
||||
internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
|
||||
Vectorizable>(device, evaluator);
|
||||
|
||||
device.parallelFor(
|
||||
tiling.block_mapper.total_block_count(), tiling.cost,
|
||||
[=, &device, &evaluator, &tiling](StorageIndex firstIdx,
|
||||
StorageIndex lastIdx) {
|
||||
ScalarNoConst* thread_buf =
|
||||
tiling.template GetCurrentThreadBuffer<ScalarNoConst>(device);
|
||||
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
|
||||
auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf);
|
||||
evaluator.evalBlock(&block);
|
||||
}
|
||||
});
|
||||
device.deallocate(tiling.buffer);
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::On> {
|
||||
@ -603,91 +486,6 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
|
||||
};
|
||||
};
|
||||
|
||||
template <typename Expression, typename DoneCallback, bool Vectorizable>
|
||||
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
|
||||
Vectorizable, /*Tileable*/ TiledEvaluation::Legacy> {
|
||||
public:
|
||||
typedef typename traits<Expression>::Index StorageIndex;
|
||||
typedef typename traits<Expression>::Scalar Scalar;
|
||||
typedef typename remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
static const int NumDims = traits<Expression>::NumDimensions;
|
||||
|
||||
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims,
|
||||
Evaluator::Layout>
|
||||
BlockMapper;
|
||||
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
|
||||
|
||||
static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
|
||||
const ThreadPoolDevice& device,
|
||||
DoneCallback done) {
|
||||
TensorAsyncExecutorContext* const ctx =
|
||||
new TensorAsyncExecutorContext(expr, device, std::move(done));
|
||||
|
||||
Index total_size = array_prod(ctx->evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
|
||||
if (total_size < cache_size &&
|
||||
!ExpressionHasTensorBroadcastingOp<Expression>::value) {
|
||||
auto delete_ctx = [ctx]() { delete ctx; };
|
||||
internal::TensorAsyncExecutor<
|
||||
Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
|
||||
/*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
|
||||
if (!need_assign) {
|
||||
delete ctx;
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->tiling =
|
||||
GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(
|
||||
device, ctx->evaluator);
|
||||
|
||||
auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
|
||||
ScalarNoConst* thread_buf =
|
||||
ctx->tiling.template GetCurrentThreadBuffer<ScalarNoConst>(
|
||||
ctx->device);
|
||||
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
|
||||
auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
|
||||
ctx->evaluator.evalBlock(&block);
|
||||
}
|
||||
};
|
||||
device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
|
||||
ctx->tiling.cost, eval_block,
|
||||
[ctx]() { delete ctx; });
|
||||
};
|
||||
|
||||
ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
|
||||
}
|
||||
|
||||
private:
|
||||
struct TensorAsyncExecutorContext {
|
||||
TensorAsyncExecutorContext(const Expression& expr,
|
||||
const ThreadPoolDevice& thread_pool,
|
||||
DoneCallback done)
|
||||
: device(thread_pool),
|
||||
evaluator(expr, thread_pool),
|
||||
on_done(std::move(done)) {}
|
||||
|
||||
~TensorAsyncExecutorContext() {
|
||||
device.deallocate(tiling.buffer);
|
||||
evaluator.cleanup();
|
||||
on_done();
|
||||
}
|
||||
|
||||
const ThreadPoolDevice& device;
|
||||
Evaluator evaluator;
|
||||
TilingContext tiling;
|
||||
|
||||
private:
|
||||
DoneCallback on_done;
|
||||
};
|
||||
};
|
||||
|
||||
template <typename Expression, typename DoneCallback, bool Vectorizable>
|
||||
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
|
||||
Vectorizable, /*Tileable*/ TiledEvaluation::On> {
|
||||
|
@ -133,7 +133,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = true,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -41,7 +41,6 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
|
||||
enum {
|
||||
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
|
||||
|
@ -96,7 +96,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<CoeffReturnType>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -105,11 +104,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
|
||||
static const int NumDims = internal::traits<ArgType>::NumDimensions;
|
||||
|
||||
typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -185,11 +179,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>*) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_buffer != NULL);
|
||||
TensorBlockReader::Run(block, m_buffer);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
|
@ -158,7 +158,6 @@ struct IsVectorizable<GpuDevice, Expression> {
|
||||
enum TiledEvaluation {
|
||||
Off = 0, // tiled evaluation is not supported
|
||||
On = 1, // still work in progress (see TensorBlockV2.h)
|
||||
Legacy = 2 // soon to be deprecated (see TensorBock.h)
|
||||
};
|
||||
|
||||
template <typename Device, typename Expression>
|
||||
@ -166,18 +165,12 @@ struct IsTileable {
|
||||
// Check that block evaluation is supported and it's a preferred option (at
|
||||
// least one sub-expression has much faster block evaluation, e.g.
|
||||
// broadcasting).
|
||||
static const bool BlockAccess =
|
||||
TensorEvaluator<Expression, Device>::BlockAccess &&
|
||||
TensorEvaluator<Expression, Device>::PreferBlockAccess;
|
||||
|
||||
static const bool BlockAccessV2 =
|
||||
TensorEvaluator<Expression, Device>::BlockAccessV2 &&
|
||||
TensorEvaluator<Expression, Device>::PreferBlockAccess;
|
||||
|
||||
static const TiledEvaluation value =
|
||||
BlockAccessV2
|
||||
? TiledEvaluation::On
|
||||
: (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
|
||||
BlockAccessV2 ? TiledEvaluation::On : TiledEvaluation::Off;
|
||||
};
|
||||
|
||||
template <typename Expression, typename Device,
|
||||
|
@ -93,7 +93,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = true,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -183,60 +182,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
Index count;
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) return;
|
||||
|
||||
static const bool is_col_major =
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
|
||||
// Compute spatial coordinates for the first block element.
|
||||
array<Index, NumDims> coords;
|
||||
extract_coordinates(output_block->first_coeff_index(), coords);
|
||||
array<Index, NumDims> initial_coords = coords;
|
||||
|
||||
CoeffReturnType* data = output_block->data();
|
||||
Index offset = 0;
|
||||
|
||||
// Initialize output block iterator state. Dimension in this array are
|
||||
// always in inner_most -> outer_most order (col major layout).
|
||||
array<BlockIteratorState, NumDims> it;
|
||||
for (Index i = 0; i < NumDims; ++i) {
|
||||
const Index dim = is_col_major ? i : NumDims - 1 - i;
|
||||
it[i].size = output_block->block_sizes()[dim];
|
||||
it[i].stride = output_block->block_strides()[dim];
|
||||
it[i].span = it[i].stride * (it[i].size - 1);
|
||||
it[i].count = 0;
|
||||
}
|
||||
eigen_assert(it[0].stride == 1);
|
||||
|
||||
while (it[NumDims - 1].count < it[NumDims - 1].size) {
|
||||
// Generate data for the inner-most dimension.
|
||||
for (Index i = 0; i < it[0].size; ++i) {
|
||||
*(data + offset + i) = m_generator(coords);
|
||||
coords[is_col_major ? 0 : NumDims - 1]++;
|
||||
}
|
||||
coords[is_col_major ? 0 : NumDims - 1] =
|
||||
initial_coords[is_col_major ? 0 : NumDims - 1];
|
||||
|
||||
// For the 1d tensor we need to generate only one inner-most dimension.
|
||||
if (NumDims == 1) break;
|
||||
|
||||
// Update offset.
|
||||
for (Index i = 1; i < NumDims; ++i) {
|
||||
if (++it[i].count < it[i].size) {
|
||||
offset += it[i].stride;
|
||||
coords[is_col_major ? i : NumDims - 1 - i]++;
|
||||
break;
|
||||
}
|
||||
if (i != NumDims - 1) it[i].count = 0;
|
||||
coords[is_col_major ? i : NumDims - 1 - i] =
|
||||
initial_coords[is_col_major ? i : NumDims - 1 - i];
|
||||
offset -= it[i].span;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
|
@ -231,7 +231,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -541,139 +540,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
internal::kSkewedInnerDims, block_total_size_max));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
OutputTensorBlock* output_block) const {
|
||||
typedef internal::ImagePatchCopyOp<Self, PacketAccess> ImagePatchCopyOp;
|
||||
typedef internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp;
|
||||
|
||||
// Calculate loop limits and various input/output dim sizes.
|
||||
const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes();
|
||||
const bool col_major =
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1];
|
||||
const Index output_depth_dim_size =
|
||||
m_dimensions[col_major ? 0 : NumDims - 1];
|
||||
const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2];
|
||||
const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2];
|
||||
const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3];
|
||||
const Index block_col_stride = row_dim_size * depth_dim_size;
|
||||
const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4];
|
||||
const Index outer_dim_size =
|
||||
block_sizes.TotalSize() /
|
||||
(depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size);
|
||||
|
||||
const Index patch_size = row_dim_size * col_dim_size * depth_dim_size;
|
||||
const Index batch_size = patch_size * patch_index_dim_size;
|
||||
|
||||
Index output_index = output_block->first_coeff_index();
|
||||
|
||||
// Loop through outer dimensions.
|
||||
for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size;
|
||||
++outer_dim_index) {
|
||||
const Index outer_output_base_index = outer_dim_index * batch_size;
|
||||
// Find the offset of the element wrt the location of the first element.
|
||||
const Index patchIndexStart = output_index / m_fastPatchStride;
|
||||
const Index patchOffset =
|
||||
(output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth;
|
||||
const Index colOffsetStart = patchOffset / m_fastColStride;
|
||||
// Other ways to index this element.
|
||||
const Index otherIndex =
|
||||
(NumDims == 4) ? 0 : output_index / m_fastOtherStride;
|
||||
const Index patch2DIndexStart =
|
||||
(NumDims == 4)
|
||||
? 0
|
||||
: (output_index - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||
// Calculate starting depth index.
|
||||
const Index depth = output_index - (output_index / m_fastOutputDepth) *
|
||||
output_depth_dim_size;
|
||||
const Index patch_input_base_index =
|
||||
depth + otherIndex * m_patchInputStride;
|
||||
|
||||
// Loop through patches.
|
||||
for (Index patch_index_dim_index = 0;
|
||||
patch_index_dim_index < patch_index_dim_size;
|
||||
++patch_index_dim_index) {
|
||||
const Index patch_output_base_index =
|
||||
outer_output_base_index + patch_index_dim_index * patch_size;
|
||||
// Patch index corresponding to the passed in index.
|
||||
const Index patchIndex = patchIndexStart + patch_index_dim_index;
|
||||
const Index patch2DIndex =
|
||||
(NumDims == 4) ? patchIndex
|
||||
: patch2DIndexStart + patch_index_dim_index;
|
||||
const Index colIndex = patch2DIndex / m_fastOutputRows;
|
||||
const Index input_col_base = colIndex * m_col_strides;
|
||||
const Index row_offset_base =
|
||||
(patch2DIndex - colIndex * m_outputRows) * m_row_strides -
|
||||
m_rowPaddingTop;
|
||||
|
||||
// Loop through columns.
|
||||
for (Index col_dim_index = 0; col_dim_index < col_dim_size;
|
||||
++col_dim_index) {
|
||||
const Index col_output_base_index =
|
||||
patch_output_base_index + col_dim_index * block_col_stride;
|
||||
|
||||
// Calculate col index in the input original tensor.
|
||||
Index colOffset = colOffsetStart + col_dim_index;
|
||||
Index inputCol =
|
||||
input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft;
|
||||
Index origInputCol =
|
||||
(m_col_inflate_strides == 1)
|
||||
? inputCol
|
||||
: ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
|
||||
|
||||
bool pad_column = false;
|
||||
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
|
||||
((m_col_inflate_strides != 1) &&
|
||||
(inputCol != origInputCol * m_col_inflate_strides))) {
|
||||
pad_column = true;
|
||||
}
|
||||
|
||||
const Index col_input_base_index =
|
||||
patch_input_base_index + origInputCol * m_colInputStride;
|
||||
const Index input_row_base =
|
||||
row_offset_base +
|
||||
((patchOffset + col_dim_index * output_row_dim_size) -
|
||||
colOffset * m_colStride) *
|
||||
m_in_row_strides;
|
||||
// Loop through rows.
|
||||
for (Index row_dim_index = 0; row_dim_index < row_dim_size;
|
||||
++row_dim_index) {
|
||||
const Index output_base_index =
|
||||
col_output_base_index + row_dim_index * depth_dim_size;
|
||||
bool pad_row = false;
|
||||
Index inputIndex;
|
||||
if (!pad_column) {
|
||||
Index inputRow =
|
||||
input_row_base + row_dim_index * m_in_row_strides;
|
||||
Index origInputRow =
|
||||
(m_row_inflate_strides == 1)
|
||||
? inputRow
|
||||
: ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride)
|
||||
: 0);
|
||||
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
|
||||
((m_row_inflate_strides != 1) &&
|
||||
(inputRow != origInputRow * m_row_inflate_strides))) {
|
||||
pad_row = true;
|
||||
} else {
|
||||
inputIndex =
|
||||
col_input_base_index + origInputRow * m_rowInputStride;
|
||||
}
|
||||
}
|
||||
// Copy (or pad) along depth dimension.
|
||||
if (pad_column || pad_row) {
|
||||
ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue),
|
||||
output_base_index, output_block->data());
|
||||
} else {
|
||||
ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index,
|
||||
output_block->data(), inputIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
output_index += m_otherStride;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
|
@ -92,7 +92,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -119,7 +119,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
@ -199,7 +198,6 @@ template<typename ArgType, typename Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
|
@ -135,11 +135,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
// TODO(andydavis, wuke) Enable BlockAccess for the general case when the
|
||||
// performance issue with block-based reshape is resolved.
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
|
||||
TensorEvaluator<ArgType, Device>::RawAccess &&
|
||||
NumInputDims > 0 && NumOutputDims > 0,
|
||||
// For trivial reshapes with raw access to underlying data we will provide
|
||||
// zero overhead block access.
|
||||
// TODO(ezhulenev): Consider adding block access without raw access?
|
||||
@ -153,14 +148,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
|
||||
InputTensorBlock;
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
|
||||
OutputTensorBlock;
|
||||
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumOutputDims,
|
||||
Layout>
|
||||
OutputTensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -177,30 +164,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
// The total size of the reshaped tensor must be equal to the total size
|
||||
// of the input tensor.
|
||||
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
|
||||
|
||||
if (BlockAccess) {
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
|
||||
m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumOutputDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
|
||||
}
|
||||
m_inputStrides[0] = 1;
|
||||
for (int i = 1; i < NumInputDims; ++i) {
|
||||
m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
|
||||
}
|
||||
} else {
|
||||
m_outputStrides[NumOutputDims - 1] = 1;
|
||||
for (int i = NumOutputDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
|
||||
}
|
||||
m_inputStrides[NumInputDims - 1] = 1;
|
||||
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||
m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
@ -249,128 +212,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
Index size;
|
||||
Index count;
|
||||
};
|
||||
// TODO(andydavis) Reduce the overhead of this function.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
OutputTensorBlock* output_block) const {
|
||||
if (m_impl.data() != NULL) {
|
||||
OutputTensorBlockReader::Run(output_block, m_impl.data());
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate output block unit-stride inner dimension length.
|
||||
const DSizes<Index, NumOutputDims>& output_block_sizes =
|
||||
output_block->block_sizes();
|
||||
Index output_inner_dim_size = 1;
|
||||
Index output_outer_dim_start = NumOutputDims;
|
||||
for (Index i = 0; i < NumOutputDims; ++i) {
|
||||
const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? i : NumOutputDims - i - 1;
|
||||
output_inner_dim_size *= output_block_sizes[dim];
|
||||
if (output_block_sizes[dim] < m_dimensions[dim]) {
|
||||
output_outer_dim_start = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize output block iterator state.
|
||||
array<BlockIteratorState, NumOutputDims> block_iter_state;
|
||||
|
||||
for (Index i = 0; i < NumOutputDims; ++i) {
|
||||
const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? i : NumOutputDims - i - 1;
|
||||
block_iter_state[i].size = output_block_sizes[dim];
|
||||
block_iter_state[i].stride = m_outputStrides[dim];
|
||||
block_iter_state[i].span =
|
||||
block_iter_state[i].stride * (block_iter_state[i].size - 1);
|
||||
block_iter_state[i].count = 0;
|
||||
}
|
||||
|
||||
const Index output_outer_dim_size = output_block_sizes.TotalSize() /
|
||||
output_inner_dim_size;
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
|
||||
m_impl.dimensions();
|
||||
|
||||
Index index = output_block->first_coeff_index();
|
||||
for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
|
||||
Index inner_idx = 0;
|
||||
while (inner_idx < output_inner_dim_size) {
|
||||
// Calculate input coords based on 'index'.
|
||||
array<Index, NumInputDims> input_coords;
|
||||
Index idx = index;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumInputDims - 1; i > 0; --i) {
|
||||
input_coords[i] = idx / m_inputStrides[i];
|
||||
idx -= input_coords[i] * m_inputStrides[i];
|
||||
}
|
||||
input_coords[0] = idx;
|
||||
} else {
|
||||
for (int i = 0; i < NumInputDims - 1; ++i) {
|
||||
input_coords[i] = idx / m_inputStrides[i];
|
||||
idx -= input_coords[i] * m_inputStrides[i];
|
||||
}
|
||||
input_coords[NumInputDims - 1] = idx;
|
||||
}
|
||||
|
||||
// Calculate target input block shape, using at most
|
||||
// 'output_inner_dim_size' coefficients along the input block's inner
|
||||
// dimensions.
|
||||
DSizes<Index, NumInputDims> input_block_sizes;
|
||||
Index num_to_allocate = output_inner_dim_size - inner_idx;
|
||||
for (Index i = 0; i < NumInputDims; ++i) {
|
||||
const Index dim =
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? i : NumInputDims - i - 1;
|
||||
input_block_sizes[dim] = numext::mini(
|
||||
num_to_allocate, (static_cast<Index>(input_dims[dim]) -
|
||||
input_coords[dim]));
|
||||
if (input_coords[dim] == 0) {
|
||||
num_to_allocate /= input_block_sizes[dim];
|
||||
} else {
|
||||
num_to_allocate = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate input block strides.
|
||||
DSizes<Index, NumInputDims> input_block_strides;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
input_block_strides[0] = 1;
|
||||
for (int i = 1; i < NumInputDims; ++i) {
|
||||
input_block_strides[i] = input_block_strides[i - 1] *
|
||||
input_block_sizes[i - 1];
|
||||
}
|
||||
} else {
|
||||
input_block_strides[NumInputDims - 1] = 1;
|
||||
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||
input_block_strides[i] = input_block_strides[i + 1] *
|
||||
input_block_sizes[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiate and read input block from input tensor.
|
||||
InputTensorBlock input_block(index, input_block_sizes,
|
||||
input_block_strides, m_inputStrides,
|
||||
output_block->data() + outer_idx *
|
||||
output_inner_dim_size + inner_idx);
|
||||
|
||||
m_impl.block(&input_block);
|
||||
|
||||
const Index input_block_total_size = input_block_sizes.TotalSize();
|
||||
index += input_block_total_size;
|
||||
inner_idx += input_block_total_size;
|
||||
}
|
||||
eigen_assert(inner_idx == output_inner_dim_size);
|
||||
index -= output_inner_dim_size;
|
||||
// Update index.
|
||||
for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
|
||||
if (++block_iter_state[i].count < block_iter_state[i].size) {
|
||||
index += block_iter_state[i].stride;
|
||||
break;
|
||||
}
|
||||
block_iter_state[i].count = 0;
|
||||
index -= block_iter_state[i].span;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
@ -408,8 +249,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
NewDimensions m_dimensions;
|
||||
DSizes<Index, NumOutputDims> m_outputStrides;
|
||||
DSizes<Index, NumInputDims> m_inputStrides;
|
||||
};
|
||||
|
||||
|
||||
@ -426,7 +265,6 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -619,7 +457,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
// slice offsets and sizes.
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -714,7 +551,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
}
|
||||
}
|
||||
// Use memcpy if it's going to be faster than using the regular evaluation.
|
||||
const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
|
||||
const MemcpyTriggerForSlicing<Index, Device, BlockAccessV2> trigger(m_device);
|
||||
if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
|
||||
EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
|
||||
for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
|
||||
@ -808,16 +645,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
m_impl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
|
||||
output_block->block_sizes(),
|
||||
output_block->block_strides(),
|
||||
TensorBlockDimensions(m_inputStrides),
|
||||
output_block->data());
|
||||
m_impl.block(&input_block);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -922,7 +749,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -1124,7 +950,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
// slice offsets and sizes.
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -1306,7 +1131,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -98,7 +98,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -96,7 +96,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -584,7 +584,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -594,11 +593,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
|
||||
OutputTensorBlock;
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
|
||||
InputTensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
@ -920,258 +914,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
m_impl.getResourceRequirements(resources);
}

EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
OutputTensorBlock* output_block) const {
// Special case full reductions to avoid input block copy below.
if (NumInputDims == NumReducedDims) {
eigen_assert(output_block->first_coeff_index() == 0);
eigen_assert(output_block->block_sizes().TotalSize() == 1);
Op reducer(m_reducer);
output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
*this, 0, m_numValuesToReduce, reducer);
return;
}

// Calculate input tensor 'slice' required to reduce output block coeffs.
DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
for (int i = 0; i < NumOutputDims; ++i) {
// Clip preserved input dimensions by output block size.
input_slice_sizes[m_output_to_input_dim_map[i]] =
output_block->block_sizes()[i];
}

// Shard input tensor slice into blocks (because it could be large if we
// need to reduce along several dimensions to calculate required output
// coefficients).
const Index max_coeff_count =
numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
input_slice_sizes.TotalSize());

// Calculate max output shard size needed to keep working set of reducers
// in L1, while leaving enough space for reducer overhead and 'PacketSize'
// reductions.
DSizes<Index, NumInputDims> target_input_block_sizes;
CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
&target_input_block_sizes);
// Calculate indices for first preserved dimension.
const Index first_preserved_dim_output_index =
static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumOutputDims - 1;
const Index first_preserved_dim_input_index =
m_output_to_input_dim_map[first_preserved_dim_output_index];
const bool inner_most_dim_preserved =
PreservingInnerMostDims ||
(first_preserved_dim_input_index ==
(static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumInputDims - 1));

// Calculate output block inner/outer dimension sizes.
const Index output_block_inner_dim_size =
output_block->block_sizes()[first_preserved_dim_output_index];
const Index output_block_outer_dim_size =
output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
// Calculate shard size for first preserved dimension.
const Index output_shard_size =
target_input_block_sizes[first_preserved_dim_input_index];
const Index num_output_shards =
(output_block_inner_dim_size + output_shard_size - 1) /
output_shard_size;

// Initialize 'tensor_slice_offsets' from input coords of output index.
DSizes<Index, NumInputDims> tensor_slice_offsets;
GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
&tensor_slice_offsets);

// Store tensor slice offset in first preserved dimension to be used
// to update tensor slice extents in loop below.
const Index first_preserved_dim_offset_start =
tensor_slice_offsets[first_preserved_dim_input_index];

array<BlockIteratorState, NumOutputDims> block_iter_state;

// Initialize state used to iterate through output coefficients
// and update 'tensor_slice_offsets' in outer preserved dims.
for (int i = 0; i < NumOutputDims - 1; ++i) {
const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? i + 1
: NumOutputDims - i - 2;
block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
block_iter_state[i].output_size = output_block->block_sizes()[dim];
block_iter_state[i].output_count = 0;
}

// Allocate input block memory.
ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
m_device.allocate(max_coeff_count * sizeof(Scalar)));
// Allocate reducer memory.
const bool packet_reductions_enabled =
(Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
const Index num_reducers =
(inner_most_dim_preserved && packet_reductions_enabled)
? (output_shard_size / PacketSize + output_shard_size % PacketSize +
PacketSize)
: output_shard_size;
typedef internal::BlockReducer<Self, Op> BlockReducer;
BlockReducer* reducers = static_cast<BlockReducer*>(
m_device.allocate(num_reducers * sizeof(BlockReducer)));

InputDimensions input_tensor_dims(m_impl.dimensions());
for (Index output_outer_index = 0;
output_outer_index < output_block_outer_dim_size;
++output_outer_index) {
for (Index output_shard_index = 0; output_shard_index < num_output_shards;
++output_shard_index) {
// Initialize 'tensor_slice_extents' for this output shard.
DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
for (int i = 0; i < NumInputDims; ++i) {
if (i == first_preserved_dim_input_index) {
// Clip first preserved dim size to output shard size.
tensor_slice_extents[i] = numext::mini(
output_shard_size,
input_slice_sizes[i] - (tensor_slice_offsets[i] -
first_preserved_dim_offset_start));

} else if (!m_reduced[i]) {
// Clip outer preserved dims to size 1, so that we reduce a
// contiguous set of output coefficients.
tensor_slice_extents[i] = 1;
}
}

// Initialize output coefficient reducers.
for (int i = 0; i < num_reducers; ++i) {
new (&reducers[i]) BlockReducer(m_reducer);
}

typedef internal::TensorSliceBlockMapper<ScalarNoConst, Index,
NumInputDims, Layout>
TensorSliceBlockMapper;

// TODO(andydavis) Consider removing 'input_block_stride_order' if we
// find that scattered reads are not worth supporting in
// TensorSliceBlockMapper.
TensorSliceBlockMapper block_mapper(
typename TensorSliceBlockMapper::Dimensions(input_tensor_dims),
tensor_slice_offsets, tensor_slice_extents,
target_input_block_sizes, DimensionList<Index, NumInputDims>());

const Index num_outputs_to_update =
tensor_slice_extents[first_preserved_dim_input_index];
const Index preserved_dim_vector_reducer_count =
(inner_most_dim_preserved && packet_reductions_enabled)
? num_outputs_to_update / PacketSize
: 0;
const Index preserved_dim_vector_coeff_count =
inner_most_dim_preserved
? preserved_dim_vector_reducer_count * PacketSize
: 0;
const Index preserved_dim_reducer_limit =
(inner_most_dim_preserved && packet_reductions_enabled)
? (preserved_dim_vector_reducer_count +
num_outputs_to_update % PacketSize)
: num_outputs_to_update;

const Index total_block_count = block_mapper.total_block_count();
for (Index b = 0; b < total_block_count; ++b) {
InputTensorBlock input_block =
block_mapper.GetBlockForIndex(b, input_block_data);
// Read.
m_impl.block(&input_block);

Index num_values_to_reduce = 1;
for (Index i = 0; i < NumInputDims; ++i) {
if (m_reduced[i]) {
num_values_to_reduce *= input_block.block_sizes()[i];
}
}
// Reduce.
if (inner_most_dim_preserved) {
const Index input_outer_dim_size =
input_block.block_sizes().TotalSize() / num_outputs_to_update;
for (Index input_outer_dim_index = 0;
input_outer_dim_index < input_outer_dim_size;
++input_outer_dim_index) {
const Index input_outer_dim_base =
input_outer_dim_index * num_outputs_to_update;
for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
PacketSize, input_block.data());
}
const Index scalar_reducer_base =
input_outer_dim_base + preserved_dim_vector_coeff_count;
for (Index i = preserved_dim_vector_reducer_count;
i < preserved_dim_reducer_limit; ++i) {
reducers[i].Reduce(scalar_reducer_base + i -
preserved_dim_vector_reducer_count,
1, input_block.data());
}
}
} else {
for (Index i = 0; i < num_outputs_to_update; ++i) {
reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
input_block.data());
}
}
}

// Finalize all reducers for this output shard.
const Index output_base_index =
output_outer_index * output_block_inner_dim_size +
output_shard_index * output_shard_size;
if (inner_most_dim_preserved) {
EIGEN_ALIGN_MAX
typename internal::remove_const<CoeffReturnType>::type
values[PacketSize];
for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
const Index reducer_base = output_base_index + i * PacketSize;
internal::pstore<CoeffReturnType, PacketReturnType>(
values, reducers[i].FinalizePacket());
for (Index j = 0; j < PacketSize; ++j) {
output_block->data()[reducer_base + j] = values[j];
}
}
const Index scalar_reducer_base =
output_base_index + preserved_dim_vector_coeff_count;

for (Index i = preserved_dim_vector_reducer_count;
i < preserved_dim_reducer_limit; ++i) {
output_block->data()[scalar_reducer_base + i -
preserved_dim_vector_reducer_count] =
reducers[i].Finalize();
}
} else {
for (int i = 0; i < num_outputs_to_update; ++i) {
output_block->data()[output_base_index + i] =
reducers[i].Finalize();
}
}

// Update 'tensor_slice_offsets' by num outputs for this output shard.
tensor_slice_offsets[first_preserved_dim_input_index] +=
num_outputs_to_update;
}
// Update slice offset for inner preserved dim.
tensor_slice_offsets[first_preserved_dim_input_index] -=
output_block_inner_dim_size;
// Update slice offsets for remaining output dims.
for (int i = 0; i < NumOutputDims - 1; ++i) {
BlockIteratorState& b = block_iter_state[i];
if (++b.output_count < b.output_size) {
++tensor_slice_offsets[b.input_dim];
break;
}
b.output_count = 0;
tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
}
}

// Free memory.
m_device.deallocate(input_block_data);
m_device.deallocate(reducers);
}

EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
@ -141,7 +141,6 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
@ -378,7 +377,6 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
@ -432,7 +430,6 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
RawAccess = false

@ -115,7 +115,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = NumDims > 0,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -248,112 +247,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
internal::kSkewedInnerDims, block_total_size_max));
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
OutputTensorBlock* output_block) const {
if (NumDims <= 0) return;

// TODO(ezhulenev): If underlying tensor expression supports and prefers
// block evaluation we must use it. Currently we use coeff and packet
// access into the underlying tensor expression.
// static const bool useBlockAccessForArgType =
// TensorEvaluator<ArgType, Device>::BlockAccess &&
// TensorEvaluator<ArgType, Device>::PreferBlockAccess;

static const bool isColMajor =
static_cast<int>(Layout) == static_cast<int>(ColMajor);

static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
const bool inner_dim_reversed = m_reverse[inner_dim_idx];

CoeffReturnType* data = output_block->data();
Index block_offset = 0;

Index input_offset = reverseIndex(output_block->first_coeff_index());

// Initialize output block iterator state. Dimension in this array are
// always in inner_most -> outer_most order (col major layout).
array<BlockIteratorState, NumDims> it;
for (Index i = 0; i < NumDims; ++i) {
const Index dim = isColMajor ? i : NumDims - 1 - i;
it[i].size = output_block->block_sizes()[dim];
it[i].count = 0;
it[i].reverse = m_reverse[dim];

it[i].block_stride = output_block->block_strides()[dim];
it[i].block_span = it[i].block_stride * (it[i].size - 1);

it[i].input_stride = m_strides[dim];
it[i].input_span = it[i].input_stride * (it[i].size - 1);

if (it[i].reverse) {
it[i].input_stride = -1 * it[i].input_stride;
it[i].input_span = -1 * it[i].input_span;
}
}

// If multiple inner dimensions have the same reverse flag, check if we can
// merge them into a single virtual inner dimension.
int effective_inner_dim = 0;
for (int i = 1; i < NumDims; ++i) {
if (it[i].reverse != it[effective_inner_dim].reverse) break;
if (it[i].block_stride != it[effective_inner_dim].size) break;
if (it[i].block_stride != numext::abs(it[i].input_stride)) break;

it[i].size = it[effective_inner_dim].size * it[i].size;

it[i].block_stride = 1;
it[i].input_stride = (inner_dim_reversed ? -1 : 1);

it[i].block_span = it[i].block_stride * (it[i].size - 1);
it[i].input_span = it[i].input_stride * (it[i].size - 1);

effective_inner_dim = i;
}

eigen_assert(it[effective_inner_dim].block_stride == 1);
eigen_assert(it[effective_inner_dim].input_stride ==
(inner_dim_reversed ? -1 : 1));

const Index inner_dim_size = it[effective_inner_dim].size;

while (it[NumDims - 1].count < it[NumDims - 1].size) {
// Copy inner-most dimension data from reversed location in input.
Index dst = block_offset;
Index src = input_offset;

// NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
// worse results in benchmarks than a simple coefficient loop.
if (inner_dim_reversed) {
for (Index i = 0; i < inner_dim_size; ++i) {
data[dst] = m_impl.coeff(src);
++dst;
--src;
}
} else {
for (Index i = 0; i < inner_dim_size; ++i) {
data[dst] = m_impl.coeff(src);
++dst;
++src;
}
}

// For the 1d tensor we need to generate only one inner-most dimension.
if ((NumDims - effective_inner_dim) == 1) break;

// Update offset.
for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
if (++it[i].count < it[i].size) {
block_offset += it[i].block_stride;
input_offset += it[i].input_stride;
break;
}
if (i != NumDims - 1) it[i].count = 0;
block_offset -= it[i].block_span;
input_offset -= it[i].input_span;
}
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@ -535,7 +428,6 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,

@ -99,7 +99,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,

@ -115,7 +115,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -125,11 +124,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
TensorBlockReader;

//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@ -249,98 +243,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
internal::kUniformAllDims, block_total_size_max));
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
TensorBlock* output_block) const {
if (m_impl.data() != NULL) {
// Fast path: we have direct access to the data, so shuffle as we read.
TensorBlockReader::Run(output_block,
srcCoeff(output_block->first_coeff_index()),
m_inverseShuffle,
m_unshuffledInputStrides,
m_impl.data());
return;
}

// Slow path: read unshuffled block from the input and shuffle in-place.
// Initialize input block sizes using input-to-output shuffle map.
DSizes<Index, NumDims> input_block_sizes;
for (Index i = 0; i < NumDims; ++i) {
input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
}

// Calculate input block strides.
DSizes<Index, NumDims> input_block_strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
input_block_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
input_block_strides[i] =
input_block_strides[i - 1] * input_block_sizes[i - 1];
}
} else {
input_block_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
input_block_strides[i] =
input_block_strides[i + 1] * input_block_sizes[i + 1];
}
}
DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
for (int i = 0; i < NumDims; ++i) {
fast_input_block_strides[i] =
internal::TensorIntDivisor<Index>(input_block_strides[i]);
}

// Read input block.
TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
input_block_sizes,
input_block_strides,
Dimensions(m_unshuffledInputStrides),
output_block->data());

m_impl.block(&input_block);

// Naive In-place shuffle: random IO but block size is O(L1 cache size).
// TODO(andydavis) Improve the performance of this in-place shuffle.
const Index total_size = input_block_sizes.TotalSize();
std::vector<bool> bitmap(total_size, false);
ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data());
const DSizes<Index, NumDims>& output_block_strides =
output_block->block_strides();
for (Index input_index = 0; input_index < total_size; ++input_index) {
if (bitmap[input_index]) {
// Coefficient at this index has already been shuffled.
continue;
}

Index output_index =
GetBlockOutputIndex(input_index, input_block_strides,
output_block_strides, fast_input_block_strides);
if (output_index == input_index) {
// Coefficient already in place.
bitmap[output_index] = true;
continue;
}

// The following loop starts at 'input_index', and shuffles
// coefficients into their shuffled location at 'output_index'.
// It skips through the array shuffling coefficients by following
// the shuffle cycle starting and ending a 'start_index'.
ScalarNoConst evicted_value;
ScalarNoConst shuffled_value = data[input_index];
do {
evicted_value = data[output_index];
data[output_index] = shuffled_value;
shuffled_value = evicted_value;
bitmap[output_index] = true;
output_index =
GetBlockOutputIndex(output_index, input_block_strides,
output_block_strides, fast_input_block_strides);
} while (output_index != input_index);

data[output_index] = shuffled_value;
bitmap[output_index] = true;
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool root_of_expr_ast = false) const {
@ -462,7 +364,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -471,11 +372,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>

typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
TensorBlockWriter;

//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
@ -502,15 +398,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlock& block) {
eigen_assert(this->m_impl.data() != NULL);
TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
this->m_inverseShuffle,
this->m_unshuffledInputStrides, this->m_impl.data());
}

template <typename TensorBlockV2>
template <typename TensorBlockV2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
const TensorBlockDesc& desc, const TensorBlockV2& block) {
eigen_assert(this->m_impl.data() != NULL);
@ -114,7 +114,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -288,7 +287,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented

@ -97,7 +97,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,

@ -183,7 +183,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -46,22 +46,6 @@ static DSizes<Index, NumDims> RandomDims() {
return DSizes<Index, NumDims>(dims);
}

/** Dummy data type to test TensorBlock copy ops. */
struct Data {
Data() : value(0) {}
explicit Data(int v) : value(v) { }
int value;
};

bool operator==(const Data& lhs, const Data& rhs) {
return lhs.value == rhs.value;
}

std::ostream& operator<<(std::ostream& os, const Data& d) {
os << "Data: value=" << d.value;
return os;
}

template <typename T>
static T* GenerateRandomData(const Index& size) {
T* data = new T[size];
@ -71,15 +55,6 @@ static T* GenerateRandomData(const Index& size) {
return data;
}

template <>
Data* GenerateRandomData(const Index& size) {
Data* data = new Data[size];
for (int i = 0; i < size; ++i) {
data[i] = Data(internal::random<int>(1, 100));
}
return data;
}

template <int NumDims>
static void Debug(DSizes<Index, NumDims> dims) {
for (int i = 0; i < NumDims; ++i) {
@ -183,84 +158,6 @@ static void test_block_mapper_maps_every_element() {
VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
}

template <typename T, int NumDims, int Layout>
static void test_slice_block_mapper_maps_every_element() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorSliceBlockMapper<T, Index, NumDims, Layout> TensorSliceBlockMapper;

DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();

// Make sure that tensor offsets + extents do not overflow.
for (int i = 0; i < NumDims; ++i) {
tensor_slice_offsets[i] =
numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
tensor_slice_extents[i] = numext::mini(
tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
}

// Keep track of elements indices available via block access.
std::set<Index> coeff_set;

int total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());

// Pick a random dimension sizes for the tensor blocks.
DSizes<Index, NumDims> block_sizes;
for (int i = 0; i < NumDims; ++i) {
block_sizes[i] = internal::random<Index>(1, tensor_slice_extents[i]);
}

TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
tensor_slice_extents, block_sizes,
DimensionList<Index, NumDims>());

for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
choose(Layout, NumDims - 1, 0),
&coeff_set);
}

VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
}

template <typename T, int NumDims, int Layout>
static void test_block_io_copy_data_from_source_to_target() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
TensorBlockMapper;

typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;

DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const Index input_tensor_size = input_tensor_dims.TotalSize();

T* input_data = GenerateRandomData<T>(input_tensor_size);
T* output_data = new T[input_tensor_size];

TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));
T* block_data = new T[block_mapper.block_dims_total_size()];

for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
TensorBlockReader::Run(&block, input_data);
TensorBlockWriter::Run(block, output_data);
}

for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}

delete[] input_data;
delete[] output_data;
delete[] block_data;
}

template <int Layout, int NumDims>
static Index GetInputIndex(Index output_index,
const array<Index, NumDims>& output_to_input_dim_map,
@ -304,179 +201,6 @@ static array<Index, NumDims> ComputeStrides(
return strides;
}

template <typename T, int NumDims, int Layout>
static void test_block_io_copy_using_reordered_dimensions() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
TensorBlockMapper;

typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;

DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const Index input_tensor_size = input_tensor_dims.TotalSize();

// Create a random input tensor.
T* input_data = GenerateRandomData<T>(input_tensor_size);

// Create a random dimension re-ordering/shuffle.
std::vector<Index> shuffle;
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
std::random_shuffle(shuffle.begin(), shuffle.end());

DSizes<Index, NumDims> output_tensor_dims;
array<Index, NumDims> input_to_output_dim_map;
array<Index, NumDims> output_to_input_dim_map;
for (Index i = 0; i < NumDims; ++i) {
output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
input_to_output_dim_map[i] = shuffle[i];
output_to_input_dim_map[shuffle[i]] = i;
}

// Random block shape and size.
TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));

T* block_data = new T[block_mapper.block_dims_total_size()];
T* output_data = new T[input_tensor_size];

array<Index, NumDims> input_tensor_strides =
ComputeStrides<Layout, NumDims>(input_tensor_dims);
array<Index, NumDims> output_tensor_strides =
ComputeStrides<Layout, NumDims>(output_tensor_dims);

for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
block.first_coeff_index(), output_to_input_dim_map,
input_tensor_strides, output_tensor_strides);
TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
input_tensor_strides, input_data);
TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
input_tensor_strides, output_data);
}

for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}

delete[] input_data;
delete[] block_data;
delete[] output_data;
}

// This is the special case for reading data with reordering, when dimensions
// before/after reordering are the same. Squeezing reads along inner dimensions
// in this case is illegal, because we reorder innermost dimension.
template <int Layout>
static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze()
{
typedef internal::TensorBlock<float, Index, 3, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 3, Layout>
TensorBlockReader;

DSizes<Index, 3> tensor_dims;
tensor_dims[0] = 7;
tensor_dims[1] = 9;
tensor_dims[2] = 7;

DSizes<Index, 3> block_dims = tensor_dims;

DSizes<Index, 3> tensor_to_block_dim_map;
tensor_to_block_dim_map[0] = 2;
tensor_to_block_dim_map[1] = 1;
tensor_to_block_dim_map[2] = 0;

DSizes<Index, 3> tensor_strides(ComputeStrides<Layout, 3>(tensor_dims));
DSizes<Index, 3> block_strides(ComputeStrides<Layout, 3>(block_dims));

const Index tensor_size = tensor_dims.TotalSize();
float* tensor_data = GenerateRandomData<float>(tensor_size);
float* block_data = new float[tensor_size];

TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data);
TensorBlockReader::Run(&block,
0,
tensor_to_block_dim_map,
tensor_strides,
tensor_data);

TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);

for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
float block_value = block_tensor(d2, d1, d0);
float tensor_value = tensor_tensor(d0, d1, d2);
VERIFY_IS_EQUAL(block_value, tensor_value);
}
}
}

delete[] block_data;
delete[] tensor_data;
}

// This is the special case for reading data with reordering, when dimensions
// before/after reordering are the same. Squeezing reads in this case is allowed
// because we reorder outer dimensions.
template <int Layout>
static void test_block_io_copy_using_reordered_dimensions_squeeze()
{
typedef internal::TensorBlock<float, Index, 4, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 4, Layout>
TensorBlockReader;

DSizes<Index, 4> tensor_dims;
tensor_dims[0] = 7;
tensor_dims[1] = 5;
tensor_dims[2] = 9;
tensor_dims[3] = 9;

DSizes<Index, 4> block_dims = tensor_dims;

DSizes<Index, 4> tensor_to_block_dim_map;
tensor_to_block_dim_map[0] = 0;
tensor_to_block_dim_map[1] = 1;
tensor_to_block_dim_map[2] = 3;
tensor_to_block_dim_map[3] = 2;

DSizes<Index, 4> tensor_strides(ComputeStrides<Layout, 4>(tensor_dims));
DSizes<Index, 4> block_strides(ComputeStrides<Layout, 4>(block_dims));

const Index tensor_size = tensor_dims.TotalSize();
float* tensor_data = GenerateRandomData<float>(tensor_size);
float* block_data = new float[tensor_size];

TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data);
TensorBlockReader::Run(&block,
0,
tensor_to_block_dim_map,
tensor_strides,
tensor_data);

TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);

for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
float block_value = block_tensor(d0, d1, d3, d2);
float tensor_value = tensor_tensor(d0, d1, d2, d3);
VERIFY_IS_EQUAL(block_value, tensor_value);
}
}
}
}

delete[] block_data;
delete[] tensor_data;
}

template<typename Scalar, typename StorageIndex, int Dim>
class EqualityChecker
{
@ -510,365 +234,6 @@ public:
}
};

template <int Layout>
static void test_block_io_zero_stride()
{
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;

DSizes<Index, 5> rnd_dims = RandomDims<5>();

DSizes<Index, 5> input_tensor_dims = rnd_dims;
input_tensor_dims[0] = 1;
input_tensor_dims[2] = 1;
input_tensor_dims[4] = 1;
const Index input_tensor_size = input_tensor_dims.TotalSize();
float* input_data = GenerateRandomData<float>(input_tensor_size);

DSizes<Index, 5> output_tensor_dims = rnd_dims;

DSizes<Index, 5> input_tensor_strides(
ComputeStrides<Layout, 5>(input_tensor_dims));
DSizes<Index, 5> output_tensor_strides(
ComputeStrides<Layout, 5>(output_tensor_dims));

DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
input_tensor_strides_with_zeros[0] = 0;
input_tensor_strides_with_zeros[2] = 0;
input_tensor_strides_with_zeros[4] = 0;

// Verify that data was correctly read/written from/into the block.
const EqualityChecker<float, Index, 5> verify_is_equal(input_data, input_tensor_dims, input_tensor_strides, output_tensor_dims, output_tensor_strides);

{
float* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
input_tensor_strides_with_zeros, output_data);
TensorBlockReader::Run(&read_block, input_data);
verify_is_equal(output_data);
delete[] output_data;
}

{
float* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock write_block(0, output_tensor_dims,
input_tensor_strides_with_zeros,
output_tensor_strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
verify_is_equal(output_data);
delete[] output_data;
}

delete[] input_data;
}

template <int Layout>
static void test_block_io_squeeze_ones() {
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;

// Total size > 1.
{
DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
const Index total_size = block_sizes.TotalSize();

// Create a random input tensor.
float* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
}

// Total size == 1.
{
DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
const Index total_size = block_sizes.TotalSize();

// Create a random input tensor.
float* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
}
}

template <typename T, int NumDims, int Layout>
static void test_block_cwise_unary_io_basic() {
typedef internal::scalar_square_op<T> UnaryFunctor;
typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, T, NumDims,
Layout>
TensorBlockCwiseUnaryIO;

DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
T* input_data = GenerateRandomData<T>(total_size);

T* output_data = new T[total_size];
UnaryFunctor functor;
TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
strides, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
}

delete[] input_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_unary_io_squeeze_ones() {
typedef internal::scalar_square_op<float> UnaryFunctor;
typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseUnaryIO;

DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
float* input_data = GenerateRandomData<float>(total_size);

float* output_data = new float[total_size];
UnaryFunctor functor;
TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
strides, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
}

delete[] input_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_unary_io_zero_strides() {
typedef internal::scalar_square_op<float> UnaryFunctor;
typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseUnaryIO;

DSizes<Index, 5> rnd_dims = RandomDims<5>();

DSizes<Index, 5> input_sizes = rnd_dims;
input_sizes[0] = 1;
input_sizes[2] = 1;
input_sizes[4] = 1;

DSizes<Index, 5> input_strides(ComputeStrides<Layout, 5>(input_sizes));
input_strides[0] = 0;
input_strides[2] = 0;
input_strides[4] = 0;

// Generate random data.
float* input_data = GenerateRandomData<float>(input_sizes.TotalSize());

DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));

const Index output_total_size = output_sizes.TotalSize();
float* output_data = new float[output_total_size];

UnaryFunctor functor;
TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides,
output_data, input_strides, input_data);
for (int i = 0; i < rnd_dims[0]; ++i) {
for (int j = 0; j < rnd_dims[1]; ++j) {
for (int k = 0; k < rnd_dims[2]; ++k) {
for (int l = 0; l < rnd_dims[3]; ++l) {
for (int m = 0; m < rnd_dims[4]; ++m) {
Index output_index = i * output_strides[0] + j * output_strides[1] +
k * output_strides[2] + l * output_strides[3] +
m * output_strides[4];
Index input_index = i * input_strides[0] + j * input_strides[1] +
k * input_strides[2] + l * input_strides[3] +
m * input_strides[4];
VERIFY_IS_EQUAL(output_data[output_index],
functor(input_data[input_index]));
}
}
}
}
}

delete[] input_data;
delete[] output_data;
}

template <typename T, int NumDims, int Layout>
static void test_block_cwise_binary_io_basic() {
typedef internal::scalar_sum_op<T> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
Layout>
TensorBlockCwiseBinaryIO;

DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
T* left_data = GenerateRandomData<T>(total_size);
T* right_data = GenerateRandomData<T>(total_size);

T* output_data = new T[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
}

delete[] left_data;
delete[] right_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_binary_io_squeeze_ones() {
typedef internal::scalar_sum_op<float> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseBinaryIO;

DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
float* left_data = GenerateRandomData<float>(total_size);
float* right_data = GenerateRandomData<float>(total_size);

float* output_data = new float[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
}

delete[] left_data;
delete[] right_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_binary_io_zero_strides() {
typedef internal::scalar_sum_op<float> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseBinaryIO;

DSizes<Index, 5> rnd_dims = RandomDims<5>();

DSizes<Index, 5> left_sizes = rnd_dims;
left_sizes[0] = 1;
left_sizes[2] = 1;
left_sizes[4] = 1;

DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
left_strides[0] = 0;
left_strides[2] = 0;
left_strides[4] = 0;

DSizes<Index, 5> right_sizes = rnd_dims;
right_sizes[1] = 1;
right_sizes[3] = 1;

DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
right_strides[1] = 0;
right_strides[3] = 0;

// Generate random data.
float* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
float* right_data = GenerateRandomData<float>(right_sizes.TotalSize());

DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));

const Index output_total_size = output_sizes.TotalSize();
float* output_data = new float[output_total_size];

BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
output_data, left_strides, left_data,
right_strides, right_data);
for (int i = 0; i < rnd_dims[0]; ++i) {
for (int j = 0; j < rnd_dims[1]; ++j) {
for (int k = 0; k < rnd_dims[2]; ++k) {
for (int l = 0; l < rnd_dims[3]; ++l) {
for (int m = 0; m < rnd_dims[4]; ++m) {
Index output_index = i * output_strides[0] + j * output_strides[1] +
k * output_strides[2] + l * output_strides[3] +
m * output_strides[4];
Index left_index = i * left_strides[0] + j * left_strides[1] +
k * left_strides[2] + l * left_strides[3] +
m * left_strides[4];
Index right_index = i * right_strides[0] + j * right_strides[1] +
k * right_strides[2] + l * right_strides[3] +
m * right_strides[4];
VERIFY_IS_EQUAL(
output_data[output_index],
functor(left_data[left_index], right_data[right_index]));
}
}
}
}
}

delete[] left_data;
delete[] right_data;
delete[] output_data;
}

template <int Layout>
static void test_uniform_block_shape()
{
@ -1196,21 +561,6 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
TEST_LAYOUTS(test_block_mapper_sanity);
TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_do_not_squeeze);
TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_squeeze);
TEST_LAYOUTS(test_block_io_zero_stride);
TEST_LAYOUTS(test_block_io_squeeze_ones);
TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic);
TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones);
TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides);
TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
TEST_LAYOUTS(test_uniform_block_shape);
TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, internal::kUniformAllDims);
@ -310,48 +310,6 @@ static void test_execute_shuffle_lvalue(Device d)
} while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
TiledEvaluation Tiling, int Layout>
static void test_execute_reduction(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");

static constexpr int ReducedDims = NumDims - 2;
static constexpr int Options = 0 | Layout;

auto dims = RandomDims<NumDims>(5, 10);
Tensor<T, NumDims, Options, Index> src(dims);
src.setRandom();

// Pick two random and unique reduction dimensions.
int reduction0 = internal::random<int>(0, NumDims - 1);
int reduction1 = internal::random<int>(0, NumDims - 1);
while (reduction0 == reduction1) {
reduction1 = internal::random<int>(0, NumDims - 1);
}

DSizes<Index, 2> reduction_axis;
reduction_axis[0] = reduction0;
reduction_axis[1] = reduction1;

Tensor<T, ReducedDims, Options, Index> golden = src.sum(reduction_axis);

// Now do the reduction using configured tensor executor.
Tensor<T, ReducedDims, Options, Index> dst(golden.dimensions());

auto expr = src.sum(reduction_axis);

using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

Executor::run(Assign(dst, expr), d);

for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
}
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
@ -663,57 +621,34 @@ static void test_async_execute_binary_expr(Device d)
|
||||
#define CALL_SUBTEST_PART(PART) \
|
||||
CALL_SUBTEST_##PART
|
||||
|
||||
#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \
|
||||
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
|
||||
|
||||
// NOTE: Tiling V2 currently implemented for a limited types of expression, and only with default device.
|
||||
#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
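As noted above, only the thread-pool device supports asynchronous expression evaluation. Below is a minimal sketch of what such an asynchronous assignment looks like through the public Tensor API, assuming the device(dev, done_callback) overload that returns a TensorAsyncDevice; the tensor shapes, the expression, and the std::promise-based completion signalling are illustrative assumptions, not code from this diff.

#define EIGEN_USE_THREADS
#include <future>
#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: run a tensor assignment asynchronously on a thread pool and
// block until the completion callback fires.
void async_assign_sketch() {
  Eigen::ThreadPool tp(4);
  Eigen::ThreadPoolDevice tp_device(&tp, 4);

  Eigen::Tensor<float, 3> src(16, 16, 16);
  Eigen::Tensor<float, 3> dst(16, 16, 16);
  src.setRandom();

  std::promise<void> done_promise;
  std::future<void> done = done_promise.get_future();

  // device(dev, done) submits the evaluation to the thread pool and invokes
  // the callback once the assignment has completed.
  dst.device(tp_device, [&done_promise] { done_promise.set_value(); }) =
      src.abs() + src.constant(1.0f);

  done.wait();  // wait for the asynchronous evaluation to finish
}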
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;
@ -724,69 +659,64 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);