Merged in ezhulenev/eigen/tiling_3 (pull request PR-438)

Tiled tensor executor
Gael Guennebaud 2018-07-31 08:13:00 +00:00
commit 678a0dcb12
33 changed files with 1989 additions and 246 deletions


@@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
-#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"


@@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
@@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
static const int NumDims = XprType::NumDims;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess & TensorEvaluator<RightArgType, Device>::BlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
}
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {
m_leftImpl.getResourceRequirements(resources);
m_rightImpl.getResourceRequirements(resources);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != nullptr) {
TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
block->tensor_strides(), block->tensor_strides(),
m_leftImpl.data() + block->first_coeff_index());
m_rightImpl.block(&left_block);
} else {
m_rightImpl.block(block);
m_leftImpl.writeBlock(*block);
}
}
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor


@@ -14,6 +14,32 @@
namespace Eigen {
namespace internal {
namespace {
// Helper template to choose between ColMajor and RowMajor values.
template <int Layout>
struct cond;
template <>
struct cond<ColMajor> {
template <typename T>
EIGEN_STRONG_INLINE const T& operator()(const T& col,
const T& /*row*/) const {
return col;
}
};
template <>
struct cond<RowMajor> {
template <typename T>
EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/,
const T& row) const {
return row;
}
};
} // namespace
/**
* \class TensorBlockShapeType
* \ingroup CXX11_Tensor_Module
@@ -39,6 +65,40 @@ enum class TensorBlockShapeType {
kSkewedInnerDims,
};
struct TensorOpResourceRequirements {
TensorBlockShapeType block_shape;
Index block_total_size;
// TODO(andydavis) Add 'target_num_threads' to support communication of
// thread-resource requirements. This will allow ops deep in the
// expression tree (like reductions) to communicate resource
// requirements based on local state (like the total number of reductions
// to be computed).
TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
const Index size)
: block_shape(shape), block_total_size(size) {}
};
// Tries to merge multiple resource requirements.
EIGEN_STRONG_INLINE void MergeResourceRequirements(
const std::vector<TensorOpResourceRequirements>& resources,
TensorBlockShapeType* block_shape, Index* block_total_size) {
if (resources.empty()) {
return;
}
// TODO(andydavis) Implement different policies (i.e. revert to a default
// policy if block shapes/sizes conflict).
*block_shape = resources[0].block_shape;
*block_total_size = resources[0].block_total_size;
for (int i = 1; i < resources.size(); ++i) {
if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
*block_shape != TensorBlockShapeType::kSkewedInnerDims) {
*block_shape = TensorBlockShapeType::kSkewedInnerDims;
}
*block_total_size =
numext::maxi(*block_total_size, resources[i].block_total_size);
}
}
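As an aside (not part of the commit), here is a minimal sketch of how this merge policy behaves when two ops disagree; it assumes the unsupported Tensor module is on the include path and that kUniformAllDims is the other TensorBlockShapeType value:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  using namespace Eigen::internal;
  std::vector<TensorOpResourceRequirements> reqs;
  // One op is happy with uniform 2048-coefficient blocks...
  reqs.emplace_back(TensorBlockShapeType::kUniformAllDims, 2048);
  // ...another asks for skewed inner dims and a larger block.
  reqs.emplace_back(TensorBlockShapeType::kSkewedInnerDims, 4096);

  TensorBlockShapeType shape = TensorBlockShapeType::kUniformAllDims;
  Eigen::Index size = 0;
  MergeResourceRequirements(reqs, &shape, &size);
  // shape == kSkewedInnerDims (any skewed request wins),
  // size  == 4096 (the maximum of the requested block sizes).
  return 0;
}
```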
/**
* \class TensorBlock
* \ingroup CXX11_Tensor_Module
@@ -48,12 +108,12 @@ enum class TensorBlockShapeType {
* This class represents a tensor block specified by the index of the
* first block coefficient, and the size of the block in each dimension.
*/
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlock {
public:
-typedef DSizes<Index, NumDims> Dimensions;
+typedef DSizes<StorageIndex, NumDims> Dimensions;
-TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes,
+TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
const Dimensions& block_strides, const Dimensions& tensor_strides,
Scalar* data)
: m_first_coeff_index(first_coeff_index),
@@ -62,7 +122,7 @@ class TensorBlock {
m_tensor_strides(tensor_strides),
m_data(data) {}
-Index first_coeff_index() const { return m_first_coeff_index; }
+StorageIndex first_coeff_index() const { return m_first_coeff_index; }
const Dimensions& block_sizes() const { return m_block_sizes; }
@@ -75,13 +135,487 @@ class TensorBlock {
const Scalar* data() const { return m_data; }
private:
-Index m_first_coeff_index;
+StorageIndex m_first_coeff_index;
Dimensions m_block_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
Scalar* m_data;  // Not owned.
};
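For orientation (illustrative only, not part of the commit), a sketch of what the five constructor arguments describe for a concrete block; in the library these values are normally produced by the block mappers further below:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using Eigen::internal::TensorBlock;
  // The 2x2 block that starts at coordinates (2, 0) of a 4x4 column-major
  // tensor. Tensor strides are {1, 4}; the block's own dense column-major
  // strides are {1, 2}; the linearized first coefficient is 2*1 + 0*4 = 2.
  float block_storage[4];
  Eigen::DSizes<Eigen::Index, 2> block_sizes(2, 2);
  Eigen::DSizes<Eigen::Index, 2> block_strides(1, 2);
  Eigen::DSizes<Eigen::Index, 2> tensor_strides(1, 4);
  TensorBlock<float, Eigen::Index, 2, Eigen::ColMajor> block(
      /*first_coeff_index=*/2, block_sizes, block_strides, tensor_strides,
      block_storage);
  return 0;
}
```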
template <typename Scalar, typename StorageIndex>
struct TensorBlockCopyOp {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
const StorageIndex src_index, const StorageIndex src_stride,
const Scalar* EIGEN_RESTRICT src_data) {
const Scalar* src_base = &src_data[src_index];
Scalar* dst_base = &dst_data[dst_index];
using Src = const Eigen::Array<Scalar, Dynamic, 1>;
using Dst = Eigen::Array<Scalar, Dynamic, 1>;
using SrcMap = Eigen::Map<Src, 0, InnerStride<>>;
using DstMap = Eigen::Map<Dst, 0, InnerStride<>>;
const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
dst = src;
}
};
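Outside the tensor machinery, the Map + InnerStride idiom used by Run() boils down to the following self-contained sketch (a hypothetical helper, not part of the commit):

```cpp
#include <Eigen/Core>

// Copy n coefficients from a strided source into a strided destination with a
// single assignment between two strided 1-D maps, as TensorBlockCopyOp does.
void strided_copy(const float* src, int src_stride, float* dst, int dst_stride,
                  int n) {
  using SrcMap = Eigen::Map<const Eigen::ArrayXf, 0, Eigen::InnerStride<>>;
  using DstMap = Eigen::Map<Eigen::ArrayXf, 0, Eigen::InnerStride<>>;
  SrcMap s(src, n, Eigen::InnerStride<>(src_stride));
  DstMap d(dst, n, Eigen::InnerStride<>(dst_stride));
  d = s;  // contiguous when both strides are 1, a strided loop otherwise
}
```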
/**
* \class TensorBlockIO
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block IO class.
*
* This class is responsible for copying data between a tensor and a tensor
* block.
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
bool BlockRead>
class TensorBlockIO {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
TensorBlockCopyOp;
protected:
struct BlockIteratorState {
StorageIndex input_stride;
StorageIndex output_stride;
StorageIndex input_span;
StorageIndex output_span;
StorageIndex size;
StorageIndex count;
};
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
const TensorBlock& block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
Scalar* dst_data) {
// Find the innermost tensor dimension whose size is not 1. This is the
// effective inner dim. If all dimensions are of size 1, then fallback to
// using the actual innermost dim to avoid out-of-bound access.
StorageIndex num_size_one_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
num_size_one_inner_dims = i;
break;
}
}
// Calculate strides and dimensions.
const StorageIndex tensor_stride1_dim = cond<Layout>()(
num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
const StorageIndex block_dim_for_tensor_stride1_dim =
NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
size_t block_inner_dim_size =
NumDims == 0 ? 1
: block.block_sizes()[block_dim_for_tensor_stride1_dim];
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
const StorageIndex block_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
if (block_inner_dim_size == block_stride &&
block_stride == tensor_strides[dim]) {
block_inner_dim_size *=
block.block_sizes()[tensor_to_block_dim_map[dim]];
++num_size_one_inner_dims;
} else {
break;
}
}
StorageIndex inputIndex;
StorageIndex outputIndex;
StorageIndex input_stride;
StorageIndex output_stride;
// Setup strides to read/write along the tensor's stride1 dimension.
if (BlockRead) {
inputIndex = first_coeff_index;
outputIndex = 0;
input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
output_stride =
NumDims == 0
? 1
: block.block_strides()[block_dim_for_tensor_stride1_dim];
} else {
inputIndex = 0;
outputIndex = first_coeff_index;
input_stride =
NumDims == 0
? 1
: block.block_strides()[block_dim_for_tensor_stride1_dim];
output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
}
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> block_iter_state;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int num_squeezed_dims = 0;
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
if (size == 1) {
continue;
}
block_iter_state[num_squeezed_dims].size = size;
if (BlockRead) {
block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
block_iter_state[num_squeezed_dims].output_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
} else {
block_iter_state[num_squeezed_dims].input_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
}
block_iter_state[num_squeezed_dims].input_span =
block_iter_state[num_squeezed_dims].input_stride *
(block_iter_state[num_squeezed_dims].size - 1);
block_iter_state[num_squeezed_dims].output_span =
block_iter_state[num_squeezed_dims].output_stride *
(block_iter_state[num_squeezed_dims].size - 1);
block_iter_state[num_squeezed_dims].count = 0;
++num_squeezed_dims;
}
// Iterate copying data from src to dst.
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block.block_sizes().TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
dst_data, inputIndex, input_stride, src_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
if (++block_iter_state[j].count < block_iter_state[j].size) {
inputIndex += block_iter_state[j].input_stride;
outputIndex += block_iter_state[j].output_stride;
break;
}
block_iter_state[j].count = 0;
inputIndex -= block_iter_state[j].input_span;
outputIndex -= block_iter_state[j].output_span;
}
}
}
};
/**
* \class TensorBlockReader
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block reader class.
*
* This class is responsible for reading a tensor block.
*
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
Layout, /*BlockRead=*/true> {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
Base;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
TensorBlock* block, const Scalar* src_data) {
array<StorageIndex, NumDims> tensor_to_block_dim_map;
for (int i = 0; i < NumDims; ++i) {
tensor_to_block_dim_map[i] = i;
}
Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
block->tensor_strides(), src_data, block->data());
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
TensorBlock* block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
tensor_strides, src_data, block->data());
}
};
/**
* \class TensorBlockWriter
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block writer class.
*
* This class is responsible for writing a tensor block.
*
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
Layout, /*BlockRead=*/false> {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
Base;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const TensorBlock& block, Scalar* dst_data) {
array<StorageIndex, NumDims> tensor_to_block_dim_map;
for (int i = 0; i < NumDims; ++i) {
tensor_to_block_dim_map[i] = i;
}
Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
block.tensor_strides(), block.data(), dst_data);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const TensorBlock& block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
tensor_strides, block.data(), dst_data);
}
};
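A hedged usage sketch (not part of the commit) that gathers a 2x2 block out of a 4x4 column-major buffer into dense block storage and then scatters it back, assuming the internal reader and writer are used directly:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using namespace Eigen;
  using namespace Eigen::internal;
  float tensor_data[16];  // 4x4 column-major tensor, values 0..15
  for (int i = 0; i < 16; ++i) tensor_data[i] = static_cast<float>(i);

  // Same block as in the TensorBlock sketch above: rows 2..3, columns 0..1.
  float block_data[4];
  DSizes<Index, 2> block_sizes(2, 2), block_strides(1, 2), tensor_strides(1, 4);
  TensorBlock<float, Index, 2, ColMajor> block(2, block_sizes, block_strides,
                                               tensor_strides, block_data);

  TensorBlockReader<float, Index, 2, ColMajor>::Run(&block, tensor_data);
  // block_data now holds {2, 3, 6, 7}.
  TensorBlockWriter<float, Index, 2, ColMajor>::Run(block, tensor_data);
  return 0;
}
```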
/**
* \class TensorBlockCwiseBinaryOp
* \ingroup CXX11_Tensor_Module
*
* \brief Carries out a cwise binary op on a number of coefficients.
*
* This class reads strided inputs from left and right operands, and writes the
* result of the cwise binary op to the strided output array.
*
*/
struct TensorBlockCwiseBinaryOp {
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
typename LeftScalar, typename RightScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const BinaryFunctor& functor, const StorageIndex num_coeff,
const StorageIndex output_index, const StorageIndex output_stride,
OutputScalar* output_data, const StorageIndex left_index,
const StorageIndex left_stride, const LeftScalar* left_data,
const StorageIndex right_index, const StorageIndex right_stride,
const RightScalar* right_data) {
using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>;
using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>;
using Out = Eigen::Array<OutputScalar, Dynamic, 1>;
using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>;
using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>;
using OutMap = Eigen::Map<Out, 0, InnerStride<>>;
const LeftScalar* lhs_base = &left_data[left_index];
const RightScalar* rhs_base = &right_data[right_index];
OutputScalar* out_base = &output_data[output_index];
const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
out =
Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
}
};
/**
* \class TensorBlockCwiseBinaryIO
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block IO class for carrying out cwise binary ops.
*
* This class carries out the binary op on given blocks.
*
*/
template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
int NumDims, int Layout>
struct TensorBlockCwiseBinaryIO {
typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
Layout>::Dimensions Dimensions;
struct BlockIteratorState {
StorageIndex output_stride, output_span;
StorageIndex left_stride, left_span;
StorageIndex right_stride, right_span;
StorageIndex size, count;
};
template <typename LeftScalar, typename RightScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const BinaryFunctor& functor, const Dimensions& block_sizes,
const Dimensions& block_strides, OutputScalar* output_data,
const array<StorageIndex, NumDims>& left_strides,
const LeftScalar* left_data,
const array<StorageIndex, NumDims>& right_strides,
const RightScalar* right_data) {
// Find the innermost dimension whose size is not 1. This is the effective
// inner dim. If all dimensions are of size 1, fallback to using the actual
// innermost dim to avoid out-of-bound access.
int num_size_one_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block_sizes[dim] != 1) {
num_size_one_inner_dims = i;
break;
}
}
// Calculate strides and dimensions.
const int inner_dim =
NumDims == 0 ? 1
: cond<Layout>()(num_size_one_inner_dims,
NumDims - num_size_one_inner_dims - 1);
StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
// Merge multiple inner dims into one for larger inner dim size (i.e.
// fewer calls to TensorBlockCwiseBinaryOp::Run()).
if (inner_dim_size == block_strides[dim] &&
block_strides[dim] == left_strides[dim] &&
block_strides[dim] == right_strides[dim]) {
inner_dim_size *= block_sizes[dim];
++num_size_one_inner_dims;
} else {
break;
}
}
StorageIndex output_index = 0, left_index = 0, right_index = 0;
const StorageIndex output_stride =
NumDims == 0 ? 1 : block_strides[inner_dim];
const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
const StorageIndex right_stride =
NumDims == 0 ? 1 : right_strides[inner_dim];
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> block_iter_state;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int num_squeezed_dims = 0;
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
const StorageIndex size = block_sizes[dim];
if (size == 1) {
continue;
}
auto& state = block_iter_state[num_squeezed_dims];
state.output_stride = block_strides[dim];
state.left_stride = left_strides[dim];
state.right_stride = right_strides[dim];
state.size = size;
state.output_span = state.output_stride * (size - 1);
state.left_span = state.left_stride * (size - 1);
state.right_span = state.right_stride * (size - 1);
state.count = 0;
++num_squeezed_dims;
}
// Compute cwise binary op.
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block_sizes.TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
output_stride, output_data, left_index,
left_stride, left_data, right_index,
right_stride, right_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
auto& state = block_iter_state[j];
if (++state.count < state.size) {
output_index += state.output_stride;
left_index += state.left_stride;
right_index += state.right_stride;
break;
}
state.count = 0;
output_index -= state.output_span;
left_index -= state.left_span;
right_index -= state.right_span;
}
}
}
};
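A small sketch (not part of the commit) of calling Run() directly on two dense 2x3 column-major blocks, assuming Eigen's internal scalar_sum_op as the binary functor; inside the library the functor and the stride arrays come from the expression evaluators:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using namespace Eigen;
  using namespace Eigen::internal;
  float lhs[6] = {0, 1, 2, 3, 4, 5};
  float rhs[6] = {10, 10, 10, 10, 10, 10};
  float out[6];
  DSizes<Index, 2> sizes(2, 3), strides(1, 2);
  array<Index, 2> lhs_strides = {{1, 2}}, rhs_strides = {{1, 2}};
  TensorBlockCwiseBinaryIO<scalar_sum_op<float>, Index, float, 2, ColMajor>::Run(
      scalar_sum_op<float>(), sizes, strides, out, lhs_strides, lhs,
      rhs_strides, rhs);
  // out[i] == lhs[i] + 10; the contiguous inner dims are merged into a single
  // TensorBlockCwiseBinaryOp::Run() call covering all six coefficients.
  return 0;
}
```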
/**
* \class TensorBlockView
* \ingroup CXX11_Tensor_Module
*
* \brief Read-only view into a block of data.
*
* This class provides read-only access to a block of data in impl. It may need
* to allocate space for holding the intermediate result.
*
*/
template <class ArgType, class Device>
struct TensorBlockView {
typedef TensorEvaluator<ArgType, Device> Impl;
typedef typename Impl::Index StorageIndex;
typedef typename remove_const<typename Impl::Scalar>::type Scalar;
static const int NumDims = array_size<typename Impl::Dimensions>::value;
typedef DSizes<StorageIndex, NumDims> Dimensions;
// Constructs a TensorBlockView for `impl`. `block` is only used for
// specifying the start offset, shape, and strides of the block.
template <typename OtherTensorBlock>
TensorBlockView(const Device& device,
const TensorEvaluator<ArgType, Device>& impl,
const OtherTensorBlock& block)
: m_device(device),
m_block_sizes(block.block_sizes()),
m_data(NULL),
m_allocated_data(NULL) {
if (Impl::RawAccess && impl.data() != NULL) {
m_data = impl.data() + block.first_coeff_index();
m_block_strides = block.tensor_strides();
} else {
// Actually make a copy.
// TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
// Consider allowing ops to request additional temporary block memory in
// TensorOpResourceRequirements.
m_allocated_data = static_cast<Scalar*>(
m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
m_data = m_allocated_data;
if (NumDims > 0) {
if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
m_block_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
}
} else {
m_block_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
}
}
}
TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
block.first_coeff_index(), m_block_sizes, m_block_strides,
block.tensor_strides(), m_allocated_data);
impl.block(&input_block);
}
}
~TensorBlockView() {
if (m_allocated_data != NULL) {
m_device.deallocate(m_allocated_data);
}
}
const Dimensions& block_sizes() const { return m_block_sizes; }
const Dimensions& block_strides() const { return m_block_strides; }
const Scalar* data() const { return m_data; }
private:
const Device& m_device;
Dimensions m_block_sizes, m_block_strides;
const Scalar* m_data; // Not owned.
Scalar* m_allocated_data; // Owned.
};
/**
* \class TensorBlockMapper
* \ingroup CXX11_Tensor_Module
@@ -90,21 +624,21 @@ class TensorBlock {
*
* This class is responsible for iterating over the blocks of a tensor.
*/
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockMapper {
public:
-typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
-typedef DSizes<Index, NumDims> Dimensions;
+typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorBlockMapper(const Dimensions& dims,
const TensorBlockShapeType block_shape,
-size_t min_target_size)
+Index min_target_size)
: m_dimensions(dims),
m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
// Calculate block counts by dimension and total block count.
-DSizes<Index, NumDims> block_count;
-for (size_t i = 0; i < block_count.rank(); ++i) {
+DSizes<StorageIndex, NumDims> block_count;
+for (Index i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
}
m_total_block_count = array_prod(block_count);
@@ -130,15 +664,15 @@ class TensorBlockMapper {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-GetBlockForIndex(Index block_index, Scalar* data) const {
-Index first_coeff_index = 0;
-DSizes<Index, NumDims> coords;
-DSizes<Index, NumDims> sizes;
-DSizes<Index, NumDims> strides;
+GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+StorageIndex first_coeff_index = 0;
+DSizes<StorageIndex, NumDims> coords;
+DSizes<StorageIndex, NumDims> sizes;
+DSizes<StorageIndex, NumDims> strides;
if (NumDims > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
-const Index idx = block_index / m_block_strides[i];
+const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = idx * m_block_dim_sizes[i];
sizes[i] =
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -156,7 +690,7 @@ class TensorBlockMapper {
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
-const Index idx = block_index / m_block_strides[i];
+const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = idx * m_block_dim_sizes[i];
sizes[i] =
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -181,23 +715,20 @@ class TensorBlockMapper {
data);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
return m_total_block_count;
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex block_dims_total_size() const {
return m_block_dim_sizes.TotalSize();
}
private:
-static int InnerDimIndex(Index i) {
-return Layout == static_cast<int>(ColMajor) ? i : NumDims - i - 1;
-}
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,
-size_t min_target_size) {
-min_target_size = numext::maxi<size_t>(1, min_target_size);
+Index min_target_size) {
+min_target_size = numext::maxi<Index>(1, min_target_size);
// If the tensor fully fits into the target size, we'll treat it as a single block.
Dimensions block_dim_sizes = tensor_dims;
@@ -226,14 +757,14 @@ class TensorBlockMapper {
dim_size_target, static_cast<size_t>(tensor_dims[i]));
}
// Add any un-allocated coefficients to inner dimension(s).
-Index total_size = block_dim_sizes.TotalSize();
+StorageIndex total_size = block_dim_sizes.TotalSize();
for (int i = 0; i < NumDims; ++i) {
-const int dim = InnerDimIndex(i);
+const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block_dim_sizes[dim] < tensor_dims[dim]) {
-const Index total_size_other_dims = total_size / block_dim_sizes[dim];
-const Index alloc_avail = divup<Index>(min_target_size, total_size_other_dims);
+const StorageIndex total_size_other_dims = total_size / block_dim_sizes[dim];
+const StorageIndex alloc_avail = divup<StorageIndex>(min_target_size, total_size_other_dims);
if (alloc_avail == block_dim_sizes[dim]) {
// Insufficient excess coefficients to allocate.
break;
@@ -243,14 +774,14 @@ class TensorBlockMapper {
}
}
} else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
-Index coeff_to_allocate = min_target_size;
+StorageIndex coeff_to_allocate = min_target_size;
for (int i = 0; i < NumDims; ++i) {
-const int dim = InnerDimIndex(i);
+const int dim = cond<Layout>()(i, NumDims - i - 1);
block_dim_sizes[dim] =
numext::mini(coeff_to_allocate, tensor_dims[dim]);
-coeff_to_allocate = divup(coeff_to_allocate, numext::maxi(static_cast<Index>(1), block_dim_sizes[dim]));
+coeff_to_allocate = divup(coeff_to_allocate, numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
}
eigen_assert(coeff_to_allocate == 1);
} else {
@@ -269,7 +800,7 @@ class TensorBlockMapper {
Dimensions m_block_dim_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
-Index m_total_block_count;
+StorageIndex m_total_block_count;
};
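An illustrative sketch (not part of the commit) of driving the mapper by hand; it assumes kUniformAllDims is the non-skewed TensorBlockShapeType value:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  using namespace Eigen;
  using namespace Eigen::internal;
  // Partition a 256x256 tensor into blocks of roughly 4096 coefficients,
  // preferring blocks that are as "square" as possible.
  typedef TensorBlockMapper<float, Index, 2, ColMajor> BlockMapper;
  BlockMapper mapper(DSizes<Index, 2>(256, 256),
                     TensorBlockShapeType::kUniformAllDims, 4096);

  std::vector<float> scratch(mapper.block_dims_total_size());
  for (Index i = 0; i < mapper.total_block_count(); ++i) {
    BlockMapper::TensorBlock block = mapper.GetBlockForIndex(i, scratch.data());
    // block.first_coeff_index() and block.block_sizes() describe one tile.
  }
  return 0;
}
```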
/**
@@ -284,12 +815,12 @@ class TensorBlockMapper {
* processed together.
*
*/
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorSliceBlockMapper {
public:
-typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
-typedef DSizes<Index, NumDims> Dimensions;
+typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorSliceBlockMapper(const Dimensions& tensor_dims,
const Dimensions& tensor_slice_offsets,
@@ -303,7 +834,7 @@ class TensorSliceBlockMapper {
m_block_stride_order(block_stride_order),
m_total_block_count(1) {
// Calculate block counts by dimension and total block count.
-DSizes<Index, NumDims> block_count;
+DSizes<StorageIndex, NumDims> block_count;
for (size_t i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
}
@@ -330,11 +861,11 @@ class TensorSliceBlockMapper {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-GetBlockForIndex(Index block_index, Scalar* data) const {
-Index first_coeff_index = 0;
-DSizes<Index, NumDims> coords;
-DSizes<Index, NumDims> sizes;
-DSizes<Index, NumDims> strides;
+GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+StorageIndex first_coeff_index = 0;
+DSizes<StorageIndex, NumDims> coords;
+DSizes<StorageIndex, NumDims> sizes;
+DSizes<StorageIndex, NumDims> strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = block_index / m_block_strides[i];
@@ -352,16 +883,16 @@ class TensorSliceBlockMapper {
m_block_dim_sizes[0]);
first_coeff_index += coords[0] * m_tensor_strides[0];
-Index prev_dim = m_block_stride_order[0];
+StorageIndex prev_dim = m_block_stride_order[0];
strides[prev_dim] = 1;
for (int i = 1; i < NumDims; ++i) {
-const Index curr_dim = m_block_stride_order[i];
+const StorageIndex curr_dim = m_block_stride_order[i];
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
prev_dim = curr_dim;
}
} else {
-for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
-const Index idx = block_index / m_block_strides[i];
+for (int i = 0; i < NumDims - 1; ++i) {
+const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
sizes[i] = numext::mini(
m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
@@ -377,10 +908,10 @@ class TensorSliceBlockMapper {
m_block_dim_sizes[NumDims - 1]);
first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
-Index prev_dim = m_block_stride_order[NumDims - 1];
+StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
strides[prev_dim] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
-const Index curr_dim = m_block_stride_order[i];
+const StorageIndex curr_dim = m_block_stride_order[i];
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
prev_dim = curr_dim;
}
@@ -390,7 +921,7 @@ class TensorSliceBlockMapper {
data);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
return m_total_block_count;
}
@@ -402,7 +933,7 @@ class TensorSliceBlockMapper {
Dimensions m_block_dim_sizes;
Dimensions m_block_stride_order;
Dimensions m_block_strides;
-Index m_total_block_count;
+StorageIndex m_total_block_count;
};
} // namespace internal


@@ -110,6 +110,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};


@@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
// slice offsets.
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
RawAccess = false
};


@@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
@@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};


@@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase
enum {
IsAligned = true,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true


@@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};


@@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D(
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
@@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D(
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
@@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
@@ -778,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false


@@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false


@@ -290,6 +290,22 @@ struct DSizes : array<DenseIndex, NumDims> {
}
}
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
#else
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
#endif
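For example (not part of the commit), the new converting constructor lets a static Sizes<> list initialize a dynamic DSizes<> of the same rank:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cstddef>

int main() {
  Eigen::Sizes<2, 3, 4> static_sizes;
  Eigen::DSizes<std::ptrdiff_t, 3> dynamic_sizes(static_sizes);
  // dynamic_sizes[0] == 2, dynamic_sizes[1] == 3, dynamic_sizes[2] == 4
  return 0;
}
```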
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {


@@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true


@@ -41,11 +41,22 @@ struct TensorEvaluator
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockReader;
typedef typename internal::TensorBlockWriter<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockWriter;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
{ }
@@ -113,6 +124,20 @@ struct TensorEvaluator
internal::unpacket_traits<PacketReturnType>::size);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
assert(m_data != NULL);
TensorBlockReader::Run(block, m_data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlock& block) {
assert(m_data != NULL);
TensorBlockWriter::Run(block, m_data);
}
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
/// required by sycl in order to construct sycl buffer from raw pointer
@@ -167,11 +192,19 @@ struct TensorEvaluator<const Derived, Device>
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockReader;
// Used for accessor extraction in SYCL Managed TensorMap:
const Derived& derived() const { return m_impl; }
@@ -219,6 +252,14 @@ struct TensorEvaluator<const Derived, Device>
internal::unpacket_traits<PacketReturnType>::size);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
assert(m_data != NULL);
TensorBlockReader::Run(block, m_data);
}
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
/// added for sycl in order to construct the buffer from the sycl device
@@ -244,6 +285,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -308,7 +350,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -375,16 +419,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess & internal::functor_traits<BinaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess & TensorEvaluator<RightArgType, Device>::BlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
@@ -399,6 +448,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
static const int NumDims = internal::array_size<
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
typedef internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumDims,
TensorEvaluator<LeftArgType, Device>::Layout>
TensorBlock;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@@ -433,6 +490,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {
m_leftImpl.getResourceRequirements(resources);
m_rightImpl.getResourceRequirements(resources);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
TensorBlock* output_block) const {
if (NumDims <= 0) {
output_block->data()[0] = coeff(0);
return;
}
internal::TensorBlockView<LeftArgType, Device> left_block(
m_device, m_leftImpl, *output_block);
internal::TensorBlockView<RightArgType, Device> right_block(
m_device, m_rightImpl, *output_block);
internal::TensorBlockCwiseBinaryIO<
BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
Layout>::Run(m_functor, output_block->block_sizes(),
output_block->block_strides(), output_block->data(),
left_block.block_strides(), left_block.data(),
right_block.block_strides(), right_block.data());
}
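At the user level nothing changes; a plain binary expression such as the sketch below (not part of the commit) simply becomes eligible for tiled evaluation once an executor with Tileable support drives it:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
  a.setRandom();
  b.setRandom();
  c = a + b;  // may now be evaluated block by block instead of in one linear pass
  return 0;
}
```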
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
@@ -442,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
BinaryOp functor() const { return m_functor; }
private:
const Device& m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -458,6 +540,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -562,6 +645,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
internal::packet_traits<Scalar>::HasBlend,
BlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false

View File

@ -12,31 +12,40 @@
namespace Eigen { namespace Eigen {
/** \class TensorExecutor /**
* \ingroup CXX11_Tensor_Module * \class TensorExecutor
* * \ingroup CXX11_Tensor_Module
* \brief The tensor executor class. *
* * \brief The tensor executor class.
* This class is responsible for launching the evaluation of the expression on *
* the specified computing device. * This class is responsible for launching the evaluation of the expression on
*/ * the specified computing device.
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
* @tparam Tileable can use block based tensor evaluation
* (see TensorBlock.h)
*/
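The two boolean template parameters pick one of several executor specializations at compile time (scalar loop, packet loop, or block-by-block evaluation). A minimal standalone sketch of this kind of boolean tag dispatch is shown below; it is not Eigen's actual implementation, and ExecutorSketch, SomeExpr and the printed strings are illustrative assumptions.

#include <cstdio>

// Generic case: fall back to a scalar loop.
template <typename Expr, bool Vectorizable, bool Tileable>
struct ExecutorSketch {
  static void run(const Expr&) { std::printf("scalar loop\n"); }
};

// Vectorizable but not tileable: use a packet loop.
template <typename Expr>
struct ExecutorSketch<Expr, /*Vectorizable*/ true, /*Tileable*/ false> {
  static void run(const Expr&) { std::printf("packet loop\n"); }
};

// Tileable (with or without vectorization): evaluate block by block.
template <typename Expr, bool Vectorizable>
struct ExecutorSketch<Expr, Vectorizable, /*Tileable*/ true> {
  static void run(const Expr&) { std::printf("block-by-block loop\n"); }
};

struct SomeExpr {};

int main() {
  ExecutorSketch<SomeExpr, false, false>::run(SomeExpr{});  // scalar
  ExecutorSketch<SomeExpr, true, false>::run(SomeExpr{});   // packet
  ExecutorSketch<SomeExpr, true, true>::run(SomeExpr{});    // tiled
}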
namespace internal { namespace internal {
// Default strategy: the expression is evaluated with a single cpu thread. /**
template<typename Expression, typename Device, bool Vectorizable> * Default strategy: the expression is evaluated sequentially with a single cpu
class TensorExecutor * thread, without vectorization and block evaluation.
{ */
template <typename Expression, typename Device, bool Vectorizable,
bool Tileable>
class TensorExecutor {
public: public:
typedef typename Expression::Index Index; using StorageIndex = typename Expression::Index;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const Device& device = Device()) static inline void run(const Expression& expr,
{ const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device); TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) if (needs_assign) {
{ const StorageIndex size = array_prod(evaluator.dimensions());
const Index size = array_prod(evaluator.dimensions()); for (StorageIndex i = 0; i < size; ++i) {
for (Index i = 0; i < size; ++i) {
evaluator.evalScalar(i); evaluator.evalScalar(i);
} }
} }
@ -44,35 +53,40 @@ class TensorExecutor
} }
}; };
/**
template<typename Expression> * Process all the data with a single cpu thread, using vectorized instructions.
class TensorExecutor<Expression, DefaultDevice, true> */
{ template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
/*Tileable*/ false> {
public: public:
typedef typename Expression::Index Index; using StorageIndex = typename Expression::Index;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) static inline void run(const Expression& expr,
{ const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) if (needs_assign) {
{ const StorageIndex size = array_prod(evaluator.dimensions());
const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits<typename TensorEvaluator<
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; Expression, DefaultDevice>::PacketReturnType>::size;
// Give the compiler a strong hint to unroll the loop. But don't insist
// on unrolling, because if the function is expensive the compiler should not // Give the compiler a strong hint to unroll the loop. But don't insist
// on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining. // unroll the loop at the expense of inlining.
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; const StorageIndex UnrolledSize =
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { (size / (4 * PacketSize)) * 4 * PacketSize;
for (Index j = 0; j < 4; j++) { for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize); evaluator.evalPacket(i + j * PacketSize);
} }
} }
const Index VectorizedSize = (size / PacketSize) * PacketSize; const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i); evaluator.evalPacket(i);
} }
for (Index i = VectorizedSize; i < size; ++i) { for (StorageIndex i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i); evaluator.evalScalar(i);
} }
} }
@ -80,41 +94,107 @@ class TensorExecutor<Expression, DefaultDevice, true>
} }
}; };
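The vectorized single-thread executor above splits the flat index range into an unrolled region (a multiple of 4 * PacketSize processed four packets at a time), a plain packet region (a multiple of PacketSize), and a scalar tail. A small worked example of that split, with assumed values size = 107 and PacketSize = 8:

#include <cassert>

int main() {
  const long size = 107, PacketSize = 8;                                   // assumed values
  const long UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;    // 96
  const long VectorizedSize = (size / PacketSize) * PacketSize;            // 104
  assert(UnrolledSize == 96 && VectorizedSize == 104);

  long visited = 0;
  for (long i = 0; i < UnrolledSize; i += 4 * PacketSize) visited += 4 * PacketSize;       // 4-packet steps
  for (long i = UnrolledSize; i < VectorizedSize; i += PacketSize) visited += PacketSize;  // single packets
  for (long i = VectorizedSize; i < size; ++i) ++visited;                                  // scalar tail
  assert(visited == size);  // every coefficient is covered exactly once
}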
/**
* Process all the data with a single cpu thread, using blocks of data. By
* sizing a block to fit L1 cache we get better cache performance.
*/
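The executor below sizes blocks so that the per-block working set fits in the first-level cache. A minimal standalone sketch of the same tiling idea (not Eigen's block mapper; the 32 KB cache size, the array names and the block sizing are assumptions):

#include <algorithm>
#include <cstddef>
#include <vector>

int main() {
  const std::size_t kAssumedL1Bytes = 32 * 1024;   // assumed L1 data cache size
  const std::size_t total_size = 1 << 20;
  std::vector<float> lhs(total_size, 1.0f), rhs(total_size, 2.0f), out(total_size);

  // One block holds roughly as many floats as fit in L1; three arrays share it.
  const std::size_t block_size =
      std::max<std::size_t>(1, kAssumedL1Bytes / (3 * sizeof(float)));

  for (std::size_t start = 0; start < total_size; start += block_size) {
    const std::size_t end = std::min(total_size, start + block_size);
    for (std::size_t i = start; i < end; ++i)      // evaluate one cache-sized block
      out[i] = lhs[i] + rhs[i];
  }
  return 0;
}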
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tileable*/ true> {
public:
using Scalar = typename traits<Expression>::Scalar;
using ScalarNoConst = typename remove_const<Scalar>::type;
using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
using StorageIndex = typename traits<Expression>::Index;
// Multicore strategy: the index space is partitioned and each partition is executed on a single core static const int NumDims = traits<Expression>::NumDimensions;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
using TensorBlock =
TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
NumDims, Evaluator::Layout>;
Evaluator evaluator(expr, device);
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
// TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tileable*/ false>::run(expr, device);
return;
}
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Size tensor blocks to fit in cache (or requested target block size).
Index block_total_size = numext::mini(cache_size, total_size);
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
// Query expression tree for desired block size/shape.
std::vector<TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
block_total_size);
block_total_size = block_mapper.block_dims_total_size();
Scalar* data = static_cast<Scalar*>(
device.allocate(block_total_size * sizeof(Scalar)));
const StorageIndex total_block_count = block_mapper.total_block_count();
for (StorageIndex i = 0; i < total_block_count; ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
evaluator.evalBlock(&block);
}
device.deallocate(data);
}
evaluator.cleanup();
}
};
/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
*/
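A minimal standalone sketch of this partitioning idea follows (plain std::thread instead of Eigen's thread pool and parallelFor; the workload size, packet size and the atomic counter are illustrative assumptions):

#include <algorithm>
#include <atomic>
#include <thread>
#include <vector>

int main() {
  const long size = 1 << 20, PacketSize = 8;       // assumed workload and packet size
  const long num_threads =
      std::max(1u, std::thread::hardware_concurrency());

  long block = (size + num_threads - 1) / num_threads;             // ceil(size / num_threads)
  block = ((block + PacketSize - 1) / PacketSize) * PacketSize;    // keep ranges packet-aligned

  std::atomic<long> evaluated{0};
  std::vector<std::thread> workers;
  for (long first = 0; first < size; first += block) {
    const long last = std::min(size, first + block);
    workers.emplace_back([first, last, &evaluated] {
      for (long i = first; i < last; ++i)          // each worker evaluates one contiguous range
        evaluated.fetch_add(1, std::memory_order_relaxed);
    });
  }
  for (auto& t : workers) t.join();
  return evaluated.load() == size ? 0 : 1;         // every index visited exactly once
}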
#ifdef EIGEN_USE_THREADS #ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable> template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange { struct EvalRange {
static void run(Evaluator* evaluator_in, const Index first, const Index last) { static void run(Evaluator* evaluator_in, const StorageIndex first,
const StorageIndex last) {
Evaluator evaluator = *evaluator_in; Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first); eigen_assert(last >= first);
for (Index i = first; i < last; ++i) { for (StorageIndex i = first; i < last; ++i) {
evaluator.evalScalar(i); evaluator.evalScalar(i);
} }
} }
static Index alignBlockSize(Index size) { static StorageIndex alignBlockSize(StorageIndex size) { return size; }
return size;
}
}; };
template <typename Evaluator, typename Index> template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, Index, true> { struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; static const int PacketSize =
unpacket_traits<typename Evaluator::PacketReturnType>::size;
static void run(Evaluator* evaluator_in, const Index first, const Index last) { static void run(Evaluator* evaluator_in, const StorageIndex first,
const StorageIndex last) {
Evaluator evaluator = *evaluator_in; Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first); eigen_assert(last >= first);
Index i = first; StorageIndex i = first;
if (last - first >= PacketSize) { if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0); eigen_assert(first % PacketSize == 0);
Index last_chunk_offset = last - 4 * PacketSize; StorageIndex last_chunk_offset = last - 4 * PacketSize;
// Give the compiler a strong hint to unroll the loop. But don't insist // Give the compiler a strong hint to unroll the loop. But don't insist
// on unrolling, because if the function is expensive the compiler should not // on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining. // unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4*PacketSize) { for (; i <= last_chunk_offset; i += 4 * PacketSize) {
for (Index j = 0; j < 4; j++) { for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize); evaluator.evalPacket(i + j * PacketSize);
} }
} }
@ -128,7 +208,7 @@ struct EvalRange<Evaluator, Index, true> {
} }
} }
static Index alignBlockSize(Index size) { static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above. // Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) { if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@ -138,106 +218,165 @@ struct EvalRange<Evaluator, Index, true> {
} }
}; };
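The alignBlockSize helper above rounds large block sizes up to a multiple of 4 * PacketSize with a mask, which works because packet sizes are powers of two. A short worked example, assuming PacketSize = 8:

#include <cassert>

int main() {
  const long PacketSize = 8;                        // assumed packet size
  const long M = 4 * PacketSize;                    // 32; the unrolled step used in run() above
  const long size = 1000;
  const long aligned = (size + M - 1) & ~(M - 1);   // round up to a multiple of M
  assert(aligned == 1024);                          // smallest multiple of 32 that is >= 1000
  assert(aligned % M == 0 && aligned >= size && aligned - size < M);
}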
template <typename Expression, bool Vectorizable> template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public: public:
typedef typename Expression::Index Index; using StorageIndex = typename Expression::Index;
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
{
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
const Index size = array_prod(evaluator.dimensions());
size_t num_threads = device.numThreads();
if (num_threads > 1) {
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
if (num_threads == 1) {
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
} else {
const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
Barrier barrier(numblocks); static inline void run(const Expression& expr,
for (int i = 0; i < numblocks; ++i) { const ThreadPoolDevice& device) {
device.enqueue_with_barrier( typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
&evaluator, i * blocksize, (i + 1) * blocksize);
} Evaluator evaluator(expr, device);
if (numblocks * blocksize < size) { const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
EvalRange<Evaluator, Index, Vectorizable>::run( if (needs_assign) {
&evaluator, numblocks * blocksize, size); const StorageIndex PacketSize =
} Vectorizable
barrier.Wait(); ? unpacket_traits<typename Evaluator::PacketReturnType>::size
} : 1;
const StorageIndex size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
[&evaluator](StorageIndex first, StorageIndex last) {
EvalRange::run(&evaluator, first, last);
});
} }
evaluator.cleanup(); evaluator.cleanup();
} }
}; };
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
public:
using Scalar = typename traits<Expression>::Scalar;
using ScalarNoConst = typename remove_const<Scalar>::type;
using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
using StorageIndex = typename traits<Expression>::Index;
static const int NumDims = traits<Expression>::NumDimensions;
static inline void run(const Expression& expr,
const ThreadPoolDevice& device) {
using TensorBlock =
TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
using TensorBlockMapper =
TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
Evaluator evaluator(expr, device);
StorageIndex total_size = array_prod(evaluator.dimensions());
StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
false>::run(expr, device);
evaluator.cleanup();
return;
}
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
Index block_total_size = 0;
// Query expression tree for desired block size/shape.
std::vector<internal::TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
int num_threads = device.numThreads();
// Estimate minimum block size based on cost.
TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
size_t block_size = static_cast<size_t>(1.0 / taskSize);
TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
block_size);
block_size = block_mapper.block_dims_total_size();
const size_t aligned_blocksize =
EIGEN_MAX_ALIGN_BYTES *
divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
device.parallelFor(
block_mapper.total_block_count(), cost * block_size,
[=, &device, &evaluator, &block_mapper](StorageIndex first,
StorageIndex last) {
// currentThreadId() returns -1 if called from a thread not in the
// thread pool, such as the main thread dispatching Eigen
// expressions.
const int thread_idx = device.currentThreadId();
eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
Scalar* thread_buf = reinterpret_cast<Scalar*>(
static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
for (StorageIndex i = first; i < last; ++i) {
auto block = block_mapper.GetBlockForIndex(i, thread_buf);
evaluator.evalBlock(&block);
}
});
device.deallocate(buf);
}
evaluator.cleanup();
}
};
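The tiled thread-pool executor above allocates num_threads + 1 aligned scratch slots in one buffer and picks a slot with currentThreadId() + 1, so callers outside the pool (thread id -1) use the extra slot at index 0. A small sketch of that indexing, with assumed sizes:

#include <cstddef>
#include <cstdlib>

int main() {
  const int num_threads = 8;                    // assumed pool size
  const std::size_t aligned_blocksize = 4096;   // assumed per-thread scratch bytes
  char* buf = static_cast<char*>(std::malloc((num_threads + 1) * aligned_blocksize));

  // thread_id ranges over -1 (caller outside the pool) .. num_threads - 1.
  auto thread_buf = [&](int thread_id) -> char* {
    return buf + aligned_blocksize * static_cast<std::size_t>(thread_id + 1);
  };

  char* caller_slot = thread_buf(-1);           // slot 0
  char* worker3_slot = thread_buf(3);           // slot 4
  (void)caller_slot; (void)worker3_slot;
  std::free(buf);
}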
#endif // EIGEN_USE_THREADS #endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU. // GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU) #if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable> template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> { class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public: public:
typedef typename Expression::Index Index; typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device); static void run(const Expression& expr, const GpuDevice& device);
}; };
#if defined(EIGEN_GPUCC) #if defined(EIGEN_GPUCC)
template <typename Evaluator, typename Index, bool Vectorizable> template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval { struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE static __device__ EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, Index first, Index last, Index step_size) { void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
for (Index i = first; i < last; i += step_size) { for (StorageIndex i = first; i < last; i += step_size) {
eval.evalScalar(i); eval.evalScalar(i);
} }
} }
}; };
template <typename Evaluator, typename Index> template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, Index, true> { struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static __device__ EIGEN_ALWAYS_INLINE static __device__ EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, Index first, Index last, Index step_size) { void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
const Index vectorized_size = (last / PacketSize) * PacketSize; const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
const Index vectorized_step_size = step_size * PacketSize; const StorageIndex vectorized_step_size = step_size * PacketSize;
// Use the vector path // Use the vector path
for (Index i = first * PacketSize; i < vectorized_size; for (StorageIndex i = first * PacketSize; i < vectorized_size;
i += vectorized_step_size) { i += vectorized_step_size) {
eval.evalPacket(i); eval.evalPacket(i);
} }
for (Index i = vectorized_size + first; i < last; i += step_size) { for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
eval.evalScalar(i); eval.evalScalar(i);
} }
} }
}; };
template <typename Evaluator, typename Index> template <typename Evaluator, typename StorageIndex>
__global__ void __global__ void
__launch_bounds__(1024) __launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) { EigenMetaKernel(Evaluator eval, StorageIndex size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x; const StorageIndex step_size = blockDim.x * gridDim.x;
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
} }
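EigenMetaKernel uses a grid-stride loop: each GPU thread starts at its global index and advances by the total number of launched threads, so the grid as a whole touches every coefficient exactly once. A plain C++ simulation of that coverage (not CUDA; the launch configuration and size are assumptions):

#include <cassert>
#include <vector>

int main() {
  const int num_blocks = 4, block_dim = 32;        // assumed launch configuration
  const int total_threads = num_blocks * block_dim;
  const int size = 1000;
  std::vector<int> hits(size, 0);
  for (int t = 0; t < total_threads; ++t)          // one pass per simulated GPU thread
    for (int i = t; i < size; i += total_threads)  // the grid-stride loop
      ++hits[i];
  for (int i = 0; i < size; ++i) assert(hits[i] == 1);  // no gaps, no overlap
}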
/*static*/ /*static*/
template <typename Expression, bool Vectorizable> template <typename Expression, bool Vectorizable, bool Tileable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const Expression& expr, const GpuDevice& device) { const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@ -246,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
const int block_size = device.maxGpuThreadsPerBlock(); const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks = device.getNumGpuMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions()); const StorageIndex size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0. // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_GPU_KERNEL( LAUNCH_GPU_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size); num_blocks, block_size, 0, device, evaluator, size);
} }
evaluator.cleanup(); evaluator.cleanup();

View File

@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
enum { enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor, Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true, CoordAccess = true,
RawAccess = true RawAccess = true

View File

@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
enum { enum {
IsAligned = true, IsAligned = true,
PacketAccess = (PacketSize > 1), PacketAccess = (PacketSize > 1),
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true RawAccess = true
}; };

View File

@ -129,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned; TensorEvaluator<Expression, GpuDevice>::IsAligned;
}; };
template <typename Device, typename Expression>
struct IsTileable {
static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
};
template <typename Expression, typename Device, template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value> bool Vectorizable = IsVectorizable<Device, Expression>::value,
bool Tileable = IsTileable<Device, Expression>::value>
class TensorExecutor; class TensorExecutor;
} // end namespace internal } // end namespace internal

View File

@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, CoordAccess = false,
RawAccess = false RawAccess = false

View File

@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
enum { enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@ -181,6 +182,7 @@ template<typename ArgType, typename Device>
enum { enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented CoordAccess = false // to be implemented
}; };

View File

@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
enum { enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
enum { enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
// slice offsets and sizes. // slice offsets and sizes.
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, CoordAccess = false,
RawAccess = false RawAccess = false
@ -557,6 +560,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
enum { enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, CoordAccess = false,
RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
static const int NumDims = internal::array_size<Strides>::value; static const int NumDims = internal::array_size<Strides>::value;
typedef typename XprType::Index Index; typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar; typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions; typedef Strides Dimensions;
@ -858,7 +861,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
} }
return inputIndex; return inputIndex;
} }
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__ #ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value)); return numext::maxi(min, numext::mini(max,value));
@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
typedef typename XprType::Index Index; typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar; typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions; typedef Strides Dimensions;

View File

@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
enum { enum {
IsAligned = true, IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true, CoordAccess = true,
RawAccess = false RawAccess = false

View File

@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, CoordAccess = false,
RawAccess = false RawAccess = false

View File

@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = Self::InputPacketAccess && Op::PacketAccess, PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false

View File

@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = false, PacketAccess = false,
BlockAccess = false,
Layout = PlainObjectType::Layout, Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false
@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = false, PacketAccess = false,
BlockAccess = false,
Layout = TensorRef<Derived>::Layout, Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false
@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = false, PacketAccess = false,
BlockAccess = false,
RawAccess = false RawAccess = false
}; };

View File

@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false
@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false

View File

@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1), PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false
@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1), PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
RawAccess = false RawAccess = false
}; };

View File

@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
enum { enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false
@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
enum { enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented CoordAccess = false, // to be implemented
RawAccess = false RawAccess = false

View File

@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
enum { enum {
IsAligned = false, IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, CoordAccess = false,
RawAccess = false RawAccess = false
@ -110,7 +111,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
for (int i = 0; i < NumInputDims; ++i) { for (int i = 0; i < NumInputDims; ++i) {
m_reduced[i] = false; m_reduced[i] = false;
} }
const Dims& op_dims = op.dims(); const Dims& op_dims = op.dims();
for (int i = 0; i < NumReducedDims; ++i) { for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op_dims[i] >= 0); eigen_assert(op_dims[i] >= 0);
@ -128,7 +129,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
eigen_assert(num_distinct_reduce_dims == NumReducedDims); eigen_assert(num_distinct_reduce_dims == NumReducedDims);
// Compute the dimensions of the result. // Compute the dimensions of the result.
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
int output_index = 0; int output_index = 0;
@ -229,7 +230,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
result += m_impl.coeff(cur_index); result += m_impl.coeff(cur_index);
cur_index += index_stride; cur_index += index_stride;
} }
return result; return result;
} }

View File

@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_striding) ei_add_test(cxx11_tensor_striding)
ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_ref) ei_add_test(cxx11_tensor_ref)
ei_add_test(cxx11_tensor_random) ei_add_test(cxx11_tensor_random)
ei_add_test(cxx11_tensor_generator) ei_add_test(cxx11_tensor_generator)

View File

@ -19,11 +19,75 @@ using Eigen::Index;
using Eigen::RowMajor; using Eigen::RowMajor;
using Eigen::ColMajor; using Eigen::ColMajor;
using internal::TensorBlockShapeType;
template<typename T> template<typename T>
static const T& choose(int layout, const T& col, const T& row) { static const T& choose(int layout, const T& col, const T& row) {
return layout == ColMajor ? col : row; return layout == ColMajor ? col : row;
} }
static const TensorBlockShapeType RandomShape() {
return internal::random<bool>()
? internal::TensorBlockShapeType::kUniformAllDims
: internal::TensorBlockShapeType::kSkewedInnerDims;
}
template <int NumDims>
static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
return internal::random<int>(1, dims.TotalSize());
}
template <int NumDims>
static DSizes<Index, NumDims> RandomDims() {
array<Index, NumDims> dims;
for (int i = 0; i < NumDims; ++i) {
dims[i] = internal::random<int>(1, 20);
}
return DSizes<Index, NumDims>(dims);
};
/** Dummy data type to test TensorBlock copy ops. */
struct Data {
Data() : Data(0) {}
explicit Data(int v) { value = v; }
int value;
};
bool operator==(const Data& lhs, const Data& rhs) {
return lhs.value == rhs.value;
}
std::ostream& operator<<(std::ostream& os, const Data& d) {
os << "Data: value=" << d.value;
return os;
}
template <typename T>
static T* GenerateRandomData(const Index& size) {
T* data = new T[size];
for (int i = 0; i < size; ++i) {
data[i] = internal::random<T>();
}
return data;
}
template <>
Data* GenerateRandomData(const Index& size) {
Data* data = new Data[size];
for (int i = 0; i < size; ++i) {
data[i] = Data(internal::random<int>(1, 100));
}
return data;
}
template <int NumDims>
static void Debug(DSizes<Index, NumDims> dims) {
for (int i = 0; i < NumDims; ++i) {
std::cout << dims[i] << "; ";
}
std::cout << std::endl;
}
template <int Layout> template <int Layout>
static void test_block_mapper_sanity() static void test_block_mapper_sanity()
{ {
@ -74,10 +138,8 @@ static void test_block_mapper_sanity()
// index in the visited set. Verify that every coeff accessed only once. // index in the visited set. Verify that every coeff accessed only once.
template <typename T, int Layout, int NumDims> template <typename T, int Layout, int NumDims>
static void UpdateCoeffSet( static void UpdateCoeffSet(
const internal::TensorBlock<T, Index, 4, Layout>& block, const internal::TensorBlock<T, Index, NumDims, Layout>& block,
Index first_coeff_index, Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
int dim_index,
std::set<Index>* visited_coeffs) {
const DSizes<Index, NumDims> block_sizes = block.block_sizes(); const DSizes<Index, NumDims> block_sizes = block.block_sizes();
const DSizes<Index, NumDims> tensor_strides = block.tensor_strides(); const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
@ -94,89 +156,840 @@ static void UpdateCoeffSet(
} }
} }
template <int Layout> template <typename T, int NumDims, int Layout>
static void test_block_mapper_maps_every_element() static void test_block_mapper_maps_every_element() {
{ using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
using T = int; using TensorBlockMapper =
using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>; internal::TensorBlockMapper<T, Index, NumDims, Layout>;
using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 4, Layout>;
DSizes<Index, 4> dims(5, 7, 11, 17); DSizes<Index, NumDims> dims = RandomDims<NumDims>();
auto total_coeffs = static_cast<int>(dims.TotalSize());
// Keep track of element indices available via block access. // Keep track of element indices available via block access.
std::set<Index> coeff_set; std::set<Index> coeff_set;
// Try different combinations of block types and sizes. // Try different combinations of block types and sizes.
auto block_shape_type = TensorBlockMapper block_mapper(dims, RandomShape(), RandomTargetSize(dims));
internal::random<bool>()
? internal::TensorBlockShapeType::kUniformAllDims
: internal::TensorBlockShapeType::kSkewedInnerDims;
auto block_target_size = internal::random<int>(1, total_coeffs);
TensorBlockMapper block_mapper(dims, block_shape_type, block_target_size);
for (int i = 0; i < block_mapper.total_block_count(); ++i) { for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr); TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(), UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
choose(Layout, 3, 0), &coeff_set); choose(Layout, NumDims - 1, 0),
&coeff_set);
} }
// Verify that every coefficient in the original Tensor is accessible through // Verify that every coefficient in the original Tensor is accessible through
// TensorBlock only once. // TensorBlock only once.
Index total_coeffs = dims.TotalSize();
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs); VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast<Index>(0)); VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast<Index>(total_coeffs - 1)); VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
} }
template <int Layout> template <typename T, int NumDims, int Layout>
static void test_slice_block_mapper_maps_every_element() static void test_slice_block_mapper_maps_every_element() {
{ using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
using T = int;
using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
using TensorSliceBlockMapper = using TensorSliceBlockMapper =
internal::TensorSliceBlockMapper<T, Index, 4, Layout>; internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>;
DSizes<Index, 4> tensor_dims(5,7,11,17); DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
DSizes<Index, 4> tensor_slice_offsets(1,3,5,7); DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
DSizes<Index, 4> tensor_slice_extents(3,2,4,5); DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();
// Make sure that tensor offsets + extents do not overflow.
for (int i = 0; i < NumDims; ++i) {
tensor_slice_offsets[i] =
numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
tensor_slice_extents[i] = numext::mini(
tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
}
// Keep track of element indices available via block access. // Keep track of element indices available via block access.
std::set<Index> coeff_set; std::set<Index> coeff_set;
auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize()); auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
// Try different combinations of block types and sizes.
auto block_shape_type =
internal::random<bool>()
? internal::TensorBlockShapeType::kUniformAllDims
: internal::TensorBlockShapeType::kSkewedInnerDims;
auto block_target_size = internal::random<int>(1, total_coeffs);
// Pick random dimension sizes for the tensor blocks. // Pick random dimension sizes for the tensor blocks.
DSizes<Index, 4> block_sizes; DSizes<Index, NumDims> block_sizes;
for (int i = 0; i < 4; ++i) { for (int i = 0; i < NumDims; ++i) {
block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]); block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]);
} }
TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets, TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
tensor_slice_extents, block_sizes, tensor_slice_extents, block_sizes,
DimensionList<Index, 4>()); DimensionList<Index, NumDims>());
for (int i = 0; i < block_mapper.total_block_count(); ++i) { for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL); TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(), UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
choose(Layout, 3, 0), &coeff_set); choose(Layout, NumDims - 1, 0),
&coeff_set);
} }
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs); VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
} }
EIGEN_DECLARE_TEST(cxx11_tensor_assign) { template <typename T, int NumDims, int Layout>
CALL_SUBTEST(test_block_mapper_sanity<ColMajor>()); static void test_block_io_copy_data_from_source_to_target() {
CALL_SUBTEST(test_block_mapper_sanity<RowMajor>()); typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
CALL_SUBTEST(test_block_mapper_maps_every_element<ColMajor>()); typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
CALL_SUBTEST(test_block_mapper_maps_every_element<RowMajor>()); TensorBlockMapper;
CALL_SUBTEST(test_slice_block_mapper_maps_every_element<ColMajor>());
CALL_SUBTEST(test_slice_block_mapper_maps_every_element<RowMajor>()); typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;
DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const auto input_tensor_size = input_tensor_dims.TotalSize();
T* input_data = GenerateRandomData<T>(input_tensor_size);
T* output_data = new T[input_tensor_size];
TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));
T* block_data = new T[block_mapper.block_dims_total_size()];
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
TensorBlockReader::Run(&block, input_data);
TensorBlockWriter::Run(block, output_data);
}
for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}
delete[] input_data;
delete[] output_data;
delete[] block_data;
} }
template <int Layout, int NumDims>
static int GetInputIndex(Index output_index,
const array<Index, NumDims>& output_to_input_dim_map,
const array<Index, NumDims>& input_strides,
const array<Index, NumDims>& output_strides) {
int input_index = 0;
if (Layout == ColMajor) {
for (int i = NumDims - 1; i > 0; --i) {
const int idx = output_index / output_strides[i];
input_index += idx * input_strides[output_to_input_dim_map[i]];
output_index -= idx * output_strides[i];
}
return input_index +
output_index * input_strides[output_to_input_dim_map[0]];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const int idx = output_index / output_strides[i];
input_index += idx * input_strides[output_to_input_dim_map[i]];
output_index -= idx * output_strides[i];
}
return input_index +
output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
}
}
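GetInputIndex peels coordinates off the output linear index using the output strides and re-accumulates them with the input strides of the mapped dimensions, recovering where the same element lives before the shuffle. A hand-checked 2D ColMajor example (the concrete dims, strides and index are assumptions): input dims (3, 4) have strides (1, 3); swapping the two dims gives output dims (4, 3) with strides (1, 4) and output_to_input_dim_map = {1, 0}; output linear index 5 is output coords (1, 1), which is input coords (1, 1), i.e. input linear index 4.

#include <cassert>

int main() {
  const int output_to_input_dim_map[2] = {1, 0};
  const int input_strides[2] = {1, 3};
  const int output_strides[2] = {1, 4};
  int output_index = 5, input_index = 0;
  // The ColMajor peel-off loop above, specialized to two dimensions.
  const int idx = output_index / output_strides[1];
  input_index += idx * input_strides[output_to_input_dim_map[1]];
  output_index -= idx * output_strides[1];
  input_index += output_index * input_strides[output_to_input_dim_map[0]];
  assert(input_index == 4);
}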
template <int Layout, int NumDims>
static array<Index, NumDims> ComputeStrides(
const array<Index, NumDims>& sizes) {
array<Index, NumDims> strides;
if (Layout == ColMajor) {
strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
strides[i] = strides[i - 1] * sizes[i - 1];
}
} else {
strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * sizes[i + 1];
}
}
return strides;
}
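ComputeStrides builds ColMajor strides as running products of the sizes to the left and RowMajor strides as running products of the sizes to the right. A worked example for assumed sizes (2, 3, 4):

#include <cassert>

int main() {
  const int sizes[3] = {2, 3, 4};
  int col[3], row[3];
  col[0] = 1;                                          // ColMajor: innermost dim is dim 0
  for (int i = 1; i < 3; ++i) col[i] = col[i - 1] * sizes[i - 1];
  row[2] = 1;                                          // RowMajor: innermost dim is the last one
  for (int i = 1; i >= 0; --i) row[i] = row[i + 1] * sizes[i + 1];
  assert(col[0] == 1 && col[1] == 2 && col[2] == 6);
  assert(row[0] == 12 && row[1] == 4 && row[2] == 1);
}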
template <typename T, int NumDims, int Layout>
static void test_block_io_copy_using_reordered_dimensions() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
TensorBlockMapper;
typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;
DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const auto input_tensor_size = input_tensor_dims.TotalSize();
// Create a random input tensor.
T* input_data = GenerateRandomData<T>(input_tensor_size);
// Create a random dimension re-ordering/shuffle.
std::vector<Index> shuffle;
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
DSizes<Index, NumDims> output_tensor_dims;
array<Index, NumDims> input_to_output_dim_map;
array<Index, NumDims> output_to_input_dim_map;
for (Index i = 0; i < NumDims; ++i) {
output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
input_to_output_dim_map[i] = shuffle[i];
output_to_input_dim_map[shuffle[i]] = i;
}
// Random block shape and size.
TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));
auto* block_data = new T[block_mapper.block_dims_total_size()];
auto* output_data = new T[input_tensor_size];
array<Index, NumDims> input_tensor_strides =
ComputeStrides<Layout, NumDims>(input_tensor_dims);
array<Index, NumDims> output_tensor_strides =
ComputeStrides<Layout, NumDims>(output_tensor_dims);
for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
block.first_coeff_index(), output_to_input_dim_map,
input_tensor_strides, output_tensor_strides);
TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
input_tensor_strides, input_data);
TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
input_tensor_strides, output_data);
}
for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}
delete[] input_data;
delete[] block_data;
delete[] output_data;
}
template <int Layout>
static void test_block_io_zero_stride()
{
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;
DSizes<Index, 5> rnd_dims = RandomDims<5>();
DSizes<Index, 5> input_tensor_dims = rnd_dims;
input_tensor_dims[0] = 1;
input_tensor_dims[2] = 1;
input_tensor_dims[4] = 1;
const auto input_tensor_size = input_tensor_dims.TotalSize();
auto* input_data = GenerateRandomData<float>(input_tensor_size);
DSizes<Index, 5> output_tensor_dims = rnd_dims;
DSizes<Index, 5> input_tensor_strides(
ComputeStrides<Layout, 5>(input_tensor_dims));
DSizes<Index, 5> output_tensor_strides(
ComputeStrides<Layout, 5>(output_tensor_dims));
DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
input_tensor_strides_with_zeros[0] = 0;
input_tensor_strides_with_zeros[2] = 0;
input_tensor_strides_with_zeros[4] = 0;
// Verify that data was correctly read/written from/into the block.
const auto verify_is_equal = [&](const float* output_data) {
for (int i = 0; i < output_tensor_dims[0]; ++i) {
for (int j = 0; j < output_tensor_dims[1]; ++j) {
for (int k = 0; k < output_tensor_dims[2]; ++k) {
for (int l = 0; l < output_tensor_dims[3]; ++l) {
for (int m = 0; m < output_tensor_dims[4]; ++m) {
const Index output_offset =
i * output_tensor_strides[0] + j * output_tensor_strides[1] +
k * output_tensor_strides[2] + l * output_tensor_strides[3] +
m * output_tensor_strides[4];
const Index input_offset =
i % input_tensor_dims[0] * input_tensor_strides[0] +
j % input_tensor_dims[1] * input_tensor_strides[1] +
k % input_tensor_dims[2] * input_tensor_strides[2] +
l % input_tensor_dims[3] * input_tensor_strides[3] +
m % input_tensor_dims[4] * input_tensor_strides[4];
VERIFY_IS_EQUAL(output_data[output_offset],
input_data[input_offset]);
}
}
}
}
}
};
{
auto* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
input_tensor_strides_with_zeros, output_data);
TensorBlockReader::Run(&read_block, input_data);
verify_is_equal(output_data);
delete[] output_data;
}
{
auto* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock write_block(0, output_tensor_dims,
input_tensor_strides_with_zeros,
output_tensor_strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
verify_is_equal(output_data);
delete[] output_data;
}
delete[] input_data;
}
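The test above emulates broadcasting by zeroing the strides of the size-1 dimensions: every coordinate along a zero-stride dimension resolves to the same input element. A tiny illustration of that addressing (shapes and values are assumptions):

#include <cassert>

int main() {
  const float input[3] = {10.f, 20.f, 30.f};   // physical shape (1, 3), ColMajor
  const int strides[2] = {0, 1};               // stride 0 broadcasts dim 0
  // Read a logical (4, 3) view: all four rows alias the single physical row.
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 3; ++j)
      assert(input[i * strides[0] + j * strides[1]] == input[j]);
}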
template <int Layout>
static void test_block_io_squeeze_ones() {
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;
// Total size > 1.
{
DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
const auto total_size = block_sizes.TotalSize();
// Create a random input tensor.
auto* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
{
auto* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
{
auto* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
}
// Total size == 1.
{
DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
const auto total_size = block_sizes.TotalSize();
// Create a random input tensor.
auto* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
{
auto* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
{
auto* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
}
}
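// Verify that TensorBlockCwiseBinaryIO applies a binary functor (here
// scalar_sum_op) coefficient-wise to two input blocks and stores the result
// in the output block.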
template <typename T, int NumDims, int Layout>
static void test_block_cwise_binary_io_basic() {
typedef internal::scalar_sum_op<T> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
Layout>
TensorBlockCwiseBinaryIO;
DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
const auto total_size = block_sizes.TotalSize();
  // Create random input tensors.
T* left_data = GenerateRandomData<T>(total_size);
T* right_data = GenerateRandomData<T>(total_size);
T* output_data = new T[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
}
delete[] left_data;
delete[] right_data;
delete[] output_data;
}
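// Same as the basic coefficient-wise test, but with several size-one
// dimensions that the binary IO implementation should be able to squeeze.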
template <int Layout>
static void test_block_cwise_binary_io_squeeze_ones() {
typedef internal::scalar_sum_op<float> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseBinaryIO;
DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
const auto total_size = block_sizes.TotalSize();
  // Create random input tensors.
auto* left_data = GenerateRandomData<float>(total_size);
auto* right_data = GenerateRandomData<float>(total_size);
auto* output_data = new float[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
}
delete[] left_data;
delete[] right_data;
delete[] output_data;
}
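// Verify coefficient-wise binary IO when some input strides are zero: a
// size-one input dimension with a zero stride reuses the same coefficient for
// every output index along that dimension (i.e. it emulates broadcasting).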
template <int Layout>
static void test_block_cwise_binary_io_zero_strides() {
typedef internal::scalar_sum_op<float> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseBinaryIO;
DSizes<Index, 5> rnd_dims = RandomDims<5>();
DSizes<Index, 5> left_sizes = rnd_dims;
left_sizes[0] = 1;
left_sizes[2] = 1;
left_sizes[4] = 1;
DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
left_strides[0] = 0;
left_strides[2] = 0;
left_strides[4] = 0;
DSizes<Index, 5> right_sizes = rnd_dims;
  right_sizes[1] = 1;
  right_sizes[3] = 1;
DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
right_strides[1] = 0;
right_strides[3] = 0;
// Generate random data.
auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
const auto output_total_size = output_sizes.TotalSize();
auto* output_data = new float[output_total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
output_data, left_strides, left_data,
right_strides, right_data);
for (int i = 0; i < rnd_dims[0]; ++i) {
for (int j = 0; j < rnd_dims[1]; ++j) {
for (int k = 0; k < rnd_dims[2]; ++k) {
for (int l = 0; l < rnd_dims[3]; ++l) {
for (int m = 0; m < rnd_dims[4]; ++m) {
Index output_index = i * output_strides[0] + j * output_strides[1] +
k * output_strides[2] + l * output_strides[3] +
m * output_strides[4];
Index left_index = i * left_strides[0] + j * left_strides[1] +
k * left_strides[2] + l * left_strides[3] +
m * left_strides[4];
Index right_index = i * right_strides[0] + j * right_strides[1] +
k * right_strides[2] + l * right_strides[3] +
m * right_strides[4];
VERIFY_IS_EQUAL(
output_data[output_index],
functor(left_data[left_index], right_data[right_index]));
}
}
}
}
}
delete[] left_data;
delete[] right_data;
delete[] output_data;
}
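// Verify that TensorBlockMapper with the 'UniformAllDims' shape spreads the
// 'max_coeff_count' budget evenly across all dimensions, and spills any
// leftover capacity into the inner-most dimensions first.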
template <int Layout>
static void test_uniform_block_shape()
{
using T = int;
typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
{
    // Test shape 'UniformAllDims' with uniform 'max_coeff_count'.
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
  // Test shape 'UniformAllDims' with a larger 'max_coeff_count' which spills
  // partially into the first inner-most dimension.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 7 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 5 * 6;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
  // Test shape 'UniformAllDims' with a larger 'max_coeff_count' which spills
  // fully into the first inner-most dimension.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 5 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
  // Test shape 'UniformAllDims' with a larger 'max_coeff_count' which spills
  // fully into the first few inner-most dimensions.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(7, 5, 6, 17, 7);
const size_t max_coeff_count = 7 * 5 * 6 * 7 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
VERIFY_IS_EQUAL(7, block.block_sizes()[3]);
VERIFY_IS_EQUAL(5, block.block_sizes()[4]);
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(7, 5, 6, 9, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 6 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(6, block.block_sizes()[3]);
VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(5, block.block_sizes()[0]);
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
// Test shape 'UniformAllDims' with full allocation to all dims.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(7, 5, 6, 17, 7);
const size_t max_coeff_count = 7 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(7, 5, 6, 9, 7);
const size_t max_coeff_count = 7 * 5 * 6 * 9 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(9, block.block_sizes()[3]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
}
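// Verify that TensorBlockMapper with the 'SkewedInnerDims' shape gives the
// inner-most dimension as many coefficients as possible before growing the
// outer dimensions.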
template <int Layout>
static void test_skewed_inner_dim_block_shape()
{
using T = int;
typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
// Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 10 * 1 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(10, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 1 * 1 * 6;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
// Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 1 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 1 * 1 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
// Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
// and partial allocation to second inner-dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 3 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY_IS_EQUAL(3, block.block_sizes()[1]);
for (int i = 2; i < 5; ++i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 1 * 15 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(15, block.block_sizes()[3]);
for (int i = 2; i >= 0; --i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
// Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
// and partial allocation to third inner-dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 5 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
for (int i = 3; i < 5; ++i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 5 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
for (int i = 1; i >= 0; --i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
}
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
// Test shape 'SkewedInnerDims' with full allocation to all dims.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
}
}
template <int Layout>
static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
{
using T = int;
// Test blocking of tensors with zero dimensions:
// - we must not crash on asserts and divisions by zero
  // - we must not return a block with zero dimensions
// (recipe for overflows/underflows, divisions by zero and NaNs later)
// - total block count must be zero
{
typedef internal::TensorBlockMapper<T, Index, 1, Layout> TensorBlockMapper;
DSizes<Index, 1> dims(0);
for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
VERIFY(block_mapper.block_dims_total_size() >= 1);
}
}
{
typedef internal::TensorBlockMapper<T, Index, 2, Layout> TensorBlockMapper;
for (int dim1 = 0; dim1 < 3; ++dim1) {
for (int dim2 = 0; dim2 < 3; ++dim2) {
DSizes<Index, 2> dims(dim1, dim2);
for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
if (dim1 * dim2 == 0) {
VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
}
VERIFY(block_mapper.block_dims_total_size() >= 1);
}
}
}
}
}
#define TEST_LAYOUTS(NAME) \
CALL_SUBTEST(NAME<ColMajor>()); \
CALL_SUBTEST(NAME<RowMajor>())
#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \
CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
CALL_SUBTEST(NAME<ColMajor>(ARG)); \
CALL_SUBTEST(NAME<RowMajor>(ARG))
EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
TEST_LAYOUTS(test_block_mapper_sanity);
TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS(test_block_io_zero_stride);
TEST_LAYOUTS(test_block_io_squeeze_ones);
TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
TEST_LAYOUTS(test_uniform_block_shape);
TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
}
#undef TEST_LAYOUTS
#undef TEST_LAYOUTS_AND_DIMS
#undef TEST_LAYOUTS_WITH_ARG

View File

@@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() {
} }
void test_cxx11_tensor_complex_cwise_ops() EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
{ {
CALL_SUBTEST(test_cuda_complex_cwise_ops<float>()); CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
CALL_SUBTEST(test_cuda_complex_cwise_ops<double>()); CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());

View File

@@ -177,7 +177,7 @@ static void test_cuda_product_reductions() {
} }
void test_cxx11_tensor_complex() EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
{ {
CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_nullary());
CALL_SUBTEST(test_cuda_sum_reductions()); CALL_SUBTEST(test_cuda_sum_reductions());

View File

@@ -0,0 +1,87 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_USE_THREADS
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all ops that support tiled execution.
template <typename Device, bool Vectorizable, bool Tileable, int Layout>
static void test_execute_binary_expr(Device d) {
  // Pick a large enough tensor size to bypass the small-tensor block
  // evaluation optimization.
int d0 = internal::random<int>(100, 200);
int d1 = internal::random<int>(100, 200);
int d2 = internal::random<int>(100, 200);
static constexpr int Options = 0;
using IndexType = int;
Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);
lhs.setRandom();
rhs.setRandom();
const auto expr = lhs + rhs;
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
Executor::run(Assign(dst, expr), d);
for (int i = 0; i < d0; ++i) {
for (int j = 0; j < d1; ++j) {
for (int k = 0; k < d2; ++k) {
float sum = lhs(i, j, k) + rhs(i, j, k);
VERIFY_IS_EQUAL(sum, dst(i, j, k));
}
}
}
}
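// Run the test for every combination of device (default and thread pool),
// vectorization, tiling and layout.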
#define CALL_SUBTEST_COMBINATIONS(NAME) \
CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
const auto num_threads = internal::random<int>(1, 24);
Eigen::ThreadPool tp(num_threads);
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
}
#undef CALL_SUBTEST_COMBINATIONS