Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when possible

parent 6913221c43
commit 966c2a7bb6
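Beyond the mechanical Index -> StorageIndex rename, the substantive change in this commit is that hand-written packet copy loops are replaced by Eigen::Map views with a runtime InnerStride, so Eigen's own assignment machinery chooses the traversal strategy. Below is a minimal standalone sketch of that pattern; it is not part of the commit, and the helper name strided_copy and its parameters are illustrative only.

#include <Eigen/Core>

// Sketch: copy n coefficients between two strided 1-D ranges using
// Eigen::Map + InnerStride instead of an explicit ploadu/pscatter loop.
// Mirrors the style of the new TensorBlockCopyOp body; names are hypothetical.
template <typename Scalar>
void strided_copy(Scalar* dst, Eigen::Index dst_stride,
                  const Scalar* src, Eigen::Index src_stride, Eigen::Index n) {
  using Src = const Eigen::Array<Scalar, Eigen::Dynamic, 1>;
  using Dst = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
  using SrcMap = Eigen::Map<Src, 0, Eigen::InnerStride<>>;
  using DstMap = Eigen::Map<Dst, 0, Eigen::InnerStride<>>;

  const SrcMap src_map(src, n, Eigen::InnerStride<>(src_stride));
  DstMap dst_map(dst, n, Eigen::InnerStride<>(dst_stride));

  // Eigen's assignment evaluator picks scalar or vectorized traversal based
  // on the strides; the caller no longer special-cases them.
  dst_map = src_map;
}

The diff below is reconstructed in unified format (context lines prefixed with a space, removals with '-', additions with '+').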
@@ -67,21 +67,21 @@ enum class TensorBlockShapeType {
 
 struct TensorOpResourceRequirements {
   TensorBlockShapeType block_shape;
-  std::size_t block_total_size;
+  Index block_total_size;
   // TODO(andydavis) Add 'target_num_threads' to support communication of
   // thread-resource requirements. This will allow ops deep in the
   // expression tree (like reductions) to communicate resources
   // requirements based on local state (like the total number of reductions
   // to be computed).
   TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
-                               const std::size_t size)
+                               const Index size)
       : block_shape(shape), block_total_size(size) {}
 };
 
 // Tries to merge multiple resource requirements.
 EIGEN_STRONG_INLINE void MergeResourceRequirements(
     const std::vector<TensorOpResourceRequirements>& resources,
-    TensorBlockShapeType* block_shape, std::size_t* block_total_size) {
+    TensorBlockShapeType* block_shape, Index* block_total_size) {
   if (resources.empty()) {
     return;
   }
@@ -108,12 +108,12 @@ EIGEN_STRONG_INLINE void MergeResourceRequirements(
 * This class represents a tensor block specified by the index of the
 * first block coefficient, and the size of the block in each dimension.
 */
-template <typename Scalar, typename Index, int NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
 class TensorBlock {
  public:
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
-  TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes,
+  TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
               const Dimensions& block_strides, const Dimensions& tensor_strides,
               Scalar* data)
       : m_first_coeff_index(first_coeff_index),
@@ -122,7 +122,7 @@ class TensorBlock {
         m_tensor_strides(tensor_strides),
         m_data(data) {}
 
-  Index first_coeff_index() const { return m_first_coeff_index; }
+  StorageIndex first_coeff_index() const { return m_first_coeff_index; }
 
   const Dimensions& block_sizes() const { return m_block_sizes; }
 
@@ -135,108 +135,33 @@ class TensorBlock {
   const Scalar* data() const { return m_data; }
 
  private:
-  Index m_first_coeff_index;
+  StorageIndex m_first_coeff_index;
   Dimensions m_block_sizes;
   Dimensions m_block_strides;
   Dimensions m_tensor_strides;
   Scalar* m_data;  // Not owned.
 };
 
-template <typename Scalar, typename Index, bool Vectorizable>
+template <typename Scalar, typename StorageIndex>
 struct TensorBlockCopyOp {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_copy, const Index dst_index,
-      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const Index src_index, const Index src_stride,
+      const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
+      const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const StorageIndex src_index, const StorageIndex src_stride,
       const Scalar* EIGEN_RESTRICT src_data) {
-    for (Index i = 0; i < num_coeff_to_copy; ++i) {
-      dst_data[dst_index + i * dst_stride] =
-          src_data[src_index + i * src_stride];
-    }
-  }
-};
-
-// NOTE: Benchmarks run on an implementation of this that broke each of the
-// loops in these conditionals into it's own template specialization (to
-// avoid conditionals in the caller's loop) did not show an improvement.
-template <typename Scalar, typename Index>
-struct TensorBlockCopyOp<Scalar, Index, true> {
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_copy, const Index dst_index,
-      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const Index src_index, const Index src_stride,
-      const Scalar* EIGEN_RESTRICT src_data) {
-    if (src_stride == 1) {
-      const Index packet_size = internal::unpacket_traits<Packet>::size;
-      const Index vectorized_size =
-          (num_coeff_to_copy / packet_size) * packet_size;
-      if (dst_stride == 1) {
-        // LINEAR
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::ploadu<Packet>(src_data + src_index + i);
-          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index + i];
-        }
-      } else {
-        // SCATTER
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::ploadu<Packet>(src_data + src_index + i);
-          internal::pscatter<Scalar, Packet>(
-              dst_data + dst_index + i * dst_stride, p, dst_stride);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] = src_data[src_index + i];
-        }
-      }
-    } else if (src_stride == 0) {
-      const Index packet_size = internal::unpacket_traits<Packet>::size;
-      const Index vectorized_size =
-          (num_coeff_to_copy / packet_size) * packet_size;
-      if (dst_stride == 1) {
-        // LINEAR
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pload1<Packet>(src_data + src_index);
-          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index];
-        }
-      } else {
-        // SCATTER
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pload1<Packet>(src_data + src_index);
-          internal::pscatter<Scalar, Packet>(
-              dst_data + dst_index + i * dst_stride, p, dst_stride);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] = src_data[src_index];
-        }
-      }
-    } else {
-      if (dst_stride == 1) {
-        // GATHER
-        const Index packet_size = internal::unpacket_traits<Packet>::size;
-        const Index vectorized_size =
-            (num_coeff_to_copy / packet_size) * packet_size;
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pgather<Scalar, Packet>(
-              src_data + src_index + i * src_stride, src_stride);
-          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index + i * src_stride];
-        }
-      } else {
-        // RANDOM
-        for (Index i = 0; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] =
-              src_data[src_index + i * src_stride];
-        }
-      }
-    }
+    const Scalar* src_base = &src_data[src_index];
+    Scalar* dst_base = &dst_data[dst_index];
+
+    using Src = const Eigen::Array<Scalar, Dynamic, 1>;
+    using Dst = Eigen::Array<Scalar, Dynamic, 1>;
+
+    using SrcMap = Eigen::Map<Src, 0, InnerStride<>>;
+    using DstMap = Eigen::Map<Dst, 0, InnerStride<>>;
+
+    const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
+    DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
+
+    dst = src;
   }
 };
 
@@ -249,34 +174,34 @@ struct TensorBlockCopyOp<Scalar, Index, true> {
 * This class is responsible for copying data between a tensor and a tensor
 * block.
 */
-template <typename Scalar, typename Index, int NumDims, int Layout,
-          bool Vectorizable, bool BlockRead>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
+          bool BlockRead>
 class TensorBlockIO {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef typename internal::TensorBlockCopyOp<Scalar, Index, Vectorizable>
+  typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
       TensorBlockCopyOp;
 
  protected:
   struct BlockIteratorState {
-    Index input_stride;
-    Index output_stride;
-    Index input_span;
-    Index output_span;
-    Index size;
-    Index count;
+    StorageIndex input_stride;
+    StorageIndex output_stride;
+    StorageIndex input_span;
+    StorageIndex output_span;
+    StorageIndex size;
+    StorageIndex count;
   };
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
-      const TensorBlock& block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, const Scalar* src_data,
+      const TensorBlock& block, StorageIndex first_coeff_index,
+      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+      const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
       Scalar* dst_data) {
     // Find the innermost tensor dimension whose size is not 1. This is the
     // effective inner dim. If all dimensions are of size 1, then fallback to
     // using the actual innermost dim to avoid out-of-bound access.
-    Index num_size_one_inner_dims = 0;
+    StorageIndex num_size_one_inner_dims = 0;
     for (int i = 0; i < NumDims; ++i) {
       const int dim = cond<Layout>()(i, NumDims - i - 1);
       if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
@@ -285,16 +210,16 @@ class TensorBlockIO {
       }
     }
     // Calculate strides and dimensions.
-    const Index tensor_stride1_dim = cond<Layout>()(
+    const StorageIndex tensor_stride1_dim = cond<Layout>()(
         num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
-    const Index block_dim_for_tensor_stride1_dim =
+    const StorageIndex block_dim_for_tensor_stride1_dim =
         NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
     size_t block_inner_dim_size =
         NumDims == 0 ? 1
                      : block.block_sizes()[block_dim_for_tensor_stride1_dim];
     for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
       const int dim = cond<Layout>()(i, NumDims - i - 1);
-      const Index block_stride =
+      const StorageIndex block_stride =
           block.block_strides()[tensor_to_block_dim_map[dim]];
       if (block_inner_dim_size == block_stride &&
           block_stride == tensor_strides[dim]) {
@@ -306,10 +231,10 @@ class TensorBlockIO {
       }
     }
 
-    Index inputIndex;
-    Index outputIndex;
-    Index input_stride;
-    Index output_stride;
+    StorageIndex inputIndex;
+    StorageIndex outputIndex;
+    StorageIndex input_stride;
+    StorageIndex output_stride;
 
     // Setup strides to read/write along the tensor's stride1 dimension.
     if (BlockRead) {
@@ -337,7 +262,7 @@ class TensorBlockIO {
     int num_squeezed_dims = 0;
     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
       const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
-      const Index size = block.block_sizes()[tensor_to_block_dim_map[dim]];
+      const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
       if (size == 1) {
         continue;
       }
@@ -362,9 +287,9 @@ class TensorBlockIO {
     }
 
     // Iterate copying data from src to dst.
-    const Index block_total_size =
+    const StorageIndex block_total_size =
         NumDims == 0 ? 1 : block.block_sizes().TotalSize();
-    for (Index i = 0; i < block_total_size; i += block_inner_dim_size) {
+    for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
       TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
                              dst_data, inputIndex, input_stride, src_data);
       // Update index.
@@ -391,19 +316,18 @@ class TensorBlockIO {
 * This class is responsible for reading a tensor block.
 *
 */
-template <typename Scalar, typename Index, int NumDims, int Layout,
-          bool Vectorizable>
-class TensorBlockReader
-    : public TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, true> {
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+                                               Layout, /*BlockRead=*/true> {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, true>
+  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
       Base;
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       TensorBlock* block, const Scalar* src_data) {
-    array<Index, NumDims> tensor_to_block_dim_map;
+    array<StorageIndex, NumDims> tensor_to_block_dim_map;
     for (int i = 0; i < NumDims; ++i) {
       tensor_to_block_dim_map[i] = i;
     }
@@ -412,9 +336,9 @@ class TensorBlockReader
   }
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      TensorBlock* block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, const Scalar* src_data) {
+      TensorBlock* block, StorageIndex first_coeff_index,
+      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+      const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
     Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
                tensor_strides, src_data, block->data());
   }
@@ -429,19 +353,18 @@ class TensorBlockReader
 * This class is responsible for writing a tensor block.
 *
 */
-template <typename Scalar, typename Index, int NumDims, int Layout,
-          bool Vectorizable>
-class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
-                                               Vectorizable, false> {
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+                                               Layout, /*BlockRead=*/false> {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
      TensorBlock;
-  typedef TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, false>
+  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
      Base;
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       const TensorBlock& block, Scalar* dst_data) {
-    array<Index, NumDims> tensor_to_block_dim_map;
+    array<StorageIndex, NumDims> tensor_to_block_dim_map;
     for (int i = 0; i < NumDims; ++i) {
       tensor_to_block_dim_map[i] = i;
     }
@@ -450,9 +373,9 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
   }
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const TensorBlock& block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, Scalar* dst_data) {
+      const TensorBlock& block, StorageIndex first_coeff_index,
+      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+      const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
     Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
                tensor_strides, block.data(), dst_data);
   }
@@ -468,67 +391,34 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
 * result of the cwise binary op to the strided output array.
 *
 */
-template <bool Vectorizable>
 struct TensorBlockCwiseBinaryOp {
-  template <typename Index, typename BinaryFunctor, typename OutputScalar,
+  template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
             typename LeftScalar, typename RightScalar>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const BinaryFunctor& functor, const Index num_coeff,
-      const Index output_index, const Index output_stride,
-      OutputScalar* output_data, const Index left_index,
-      const Index left_stride, const LeftScalar* left_data,
-      const Index right_index, const Index right_stride,
+      const BinaryFunctor& functor, const StorageIndex num_coeff,
+      const StorageIndex output_index, const StorageIndex output_stride,
+      OutputScalar* output_data, const StorageIndex left_index,
+      const StorageIndex left_stride, const LeftScalar* left_data,
+      const StorageIndex right_index, const StorageIndex right_stride,
       const RightScalar* right_data) {
-    for (Index i = 0; i < num_coeff; ++i) {
-      output_data[output_index + i * output_stride] =
-          functor(left_data[left_index + i * left_stride],
-                  right_data[right_index + i * right_stride]);
-    }
-  }
-};
-
-template <>
-struct TensorBlockCwiseBinaryOp<true> {
-  template <typename Index, typename BinaryFunctor, typename OutputScalar,
-            typename LeftScalar, typename RightScalar>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const BinaryFunctor& functor, const Index num_coeff,
-      const Index output_index, const Index output_stride,
-      OutputScalar* output_data, const Index left_index,
-      const Index left_stride, const LeftScalar* left_data,
-      const Index right_index, const Index right_stride,
-      const RightScalar* right_data) {
-    EIGEN_STATIC_ASSERT(functor_traits<BinaryFunctor>::PacketAccess,
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    typedef typename packet_traits<OutputScalar>::type OutputPacket;
-    typedef typename packet_traits<LeftScalar>::type LeftPacket;
-    typedef typename packet_traits<RightScalar>::type RightPacket;
-    const Index packet_size = unpacket_traits<OutputPacket>::size;
-    EIGEN_STATIC_ASSERT(unpacket_traits<LeftPacket>::size == packet_size,
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT(unpacket_traits<RightPacket>::size == packet_size,
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    const Index vectorized_size = (num_coeff / packet_size) * packet_size;
-    if (output_stride != 1 || left_stride != 1 || right_stride != 1) {
-      TensorBlockCwiseBinaryOp<false>::Run(
-          functor, num_coeff, output_index, output_stride, output_data,
-          left_index, left_stride, left_data, right_index, right_stride,
-          right_data);
-      return;
-    }
-    // Vectorization for the most common case.
-    for (Index i = 0; i < vectorized_size; i += packet_size) {
-      LeftPacket l = internal::ploadu<LeftPacket>(left_data + left_index + i);
-      RightPacket r =
-          internal::ploadu<RightPacket>(right_data + right_index + i);
-      OutputPacket p = functor.packetOp(l, r);
-      internal::pstoreu<OutputScalar, OutputPacket>(
-          output_data + output_index + i, p);
-    }
-    for (Index i = vectorized_size; i < num_coeff; ++i) {
-      output_data[output_index + i] =
-          functor(left_data[left_index + i], right_data[right_index + i]);
-    }
+    using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>;
+    using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>;
+    using Out = Eigen::Array<OutputScalar, Dynamic, 1>;
+
+    using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>;
+    using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>;
+    using OutMap = Eigen::Map<Out, 0, InnerStride<>>;
+
+    const LeftScalar* lhs_base = &left_data[left_index];
+    const RightScalar* rhs_base = &right_data[right_index];
+    OutputScalar* out_base = &output_data[output_index];
+
+    const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
+    const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
+    OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
+
+    out =
+        Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
   }
 };
 
@@ -541,28 +431,26 @@ struct TensorBlockCwiseBinaryOp<true> {
 * This class carries out the binary op on given blocks.
 *
 */
-template <typename BinaryFunctor, typename Index, typename OutputScalar,
+template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
           int NumDims, int Layout>
 struct TensorBlockCwiseBinaryIO {
-  typedef typename internal::TensorBlock<OutputScalar, Index, NumDims,
+  typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
                                          Layout>::Dimensions Dimensions;
-  typedef internal::TensorBlockCwiseBinaryOp<
-      functor_traits<BinaryFunctor>::PacketAccess>
-      TensorBlockCwiseBinaryOp;
 
   struct BlockIteratorState {
-    Index output_stride, output_span;
-    Index left_stride, left_span;
-    Index right_stride, right_span;
-    Index size, count;
+    StorageIndex output_stride, output_span;
+    StorageIndex left_stride, left_span;
+    StorageIndex right_stride, right_span;
+    StorageIndex size, count;
   };
 
   template <typename LeftScalar, typename RightScalar>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       const BinaryFunctor& functor, const Dimensions& block_sizes,
       const Dimensions& block_strides, OutputScalar* output_data,
-      const array<Index, NumDims>& left_strides, const LeftScalar* left_data,
-      const array<Index, NumDims>& right_strides,
+      const array<StorageIndex, NumDims>& left_strides,
+      const LeftScalar* left_data,
+      const array<StorageIndex, NumDims>& right_strides,
       const RightScalar* right_data) {
     // Find the innermost dimension whose size is not 1. This is the effective
     // inner dim. If all dimensions are of size 1, fallback to using the actual
@@ -580,7 +468,7 @@ struct TensorBlockCwiseBinaryIO {
         NumDims == 0 ? 1
                      : cond<Layout>()(num_size_one_inner_dims,
                                       NumDims - num_size_one_inner_dims - 1);
-    Index inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
+    StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
     for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
       const int dim = cond<Layout>()(i, NumDims - i - 1);
       // Merge multiple inner dims into one for larger inner dim size (i.e.
@@ -595,10 +483,12 @@ struct TensorBlockCwiseBinaryIO {
       }
     }
 
-    Index output_index = 0, left_index = 0, right_index = 0;
-    const Index output_stride = NumDims == 0 ? 1 : block_strides[inner_dim];
-    const Index left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
-    const Index right_stride = NumDims == 0 ? 1 : right_strides[inner_dim];
+    StorageIndex output_index = 0, left_index = 0, right_index = 0;
+    const StorageIndex output_stride =
+        NumDims == 0 ? 1 : block_strides[inner_dim];
+    const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
+    const StorageIndex right_stride =
+        NumDims == 0 ? 1 : right_strides[inner_dim];
 
     const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
     array<BlockIteratorState, at_least_1_dim> block_iter_state;
@@ -607,7 +497,7 @@ struct TensorBlockCwiseBinaryIO {
     int num_squeezed_dims = 0;
     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
       const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
-      const Index size = block_sizes[dim];
+      const StorageIndex size = block_sizes[dim];
       if (size == 1) {
         continue;
       }
@@ -624,8 +514,9 @@ struct TensorBlockCwiseBinaryIO {
     }
 
     // Compute cwise binary op.
-    const Index block_total_size = NumDims == 0 ? 1 : block_sizes.TotalSize();
-    for (Index i = 0; i < block_total_size; i += inner_dim_size) {
+    const StorageIndex block_total_size =
+        NumDims == 0 ? 1 : block_sizes.TotalSize();
+    for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
       TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
                                     output_stride, output_data, left_index,
                                     left_stride, left_data, right_index,
@@ -661,10 +552,10 @@ struct TensorBlockCwiseBinaryIO {
 template <class ArgType, class Device>
 struct TensorBlockView {
   typedef TensorEvaluator<ArgType, Device> Impl;
-  typedef typename Impl::Index Index;
+  typedef typename Impl::Index StorageIndex;
   typedef typename remove_const<typename Impl::Scalar>::type Scalar;
   static const int NumDims = array_size<typename Impl::Dimensions>::value;
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
   // Constructs a TensorBlockView for `impl`. `block` is only used for for
   // specifying the start offset, shape, and strides of the block.
@@ -701,7 +592,7 @@ struct TensorBlockView {
       }
      }
    }
-    TensorBlock<Scalar, Index, NumDims, Impl::Layout> input_block(
+    TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
        block.first_coeff_index(), m_block_sizes, m_block_strides,
        block.tensor_strides(), m_allocated_data);
    impl.block(&input_block);
@@ -733,21 +624,21 @@ struct TensorBlockView {
 *
 * This class is responsible for iterating over the blocks of a tensor.
 */
-template <typename Scalar, typename Index, int NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
 class TensorBlockMapper {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
   TensorBlockMapper(const Dimensions& dims,
                     const TensorBlockShapeType block_shape,
-                    size_t min_target_size)
+                    Index min_target_size)
       : m_dimensions(dims),
         m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
     // Calculate block counts by dimension and total block count.
-    DSizes<Index, NumDims> block_count;
-    for (size_t i = 0; i < block_count.rank(); ++i) {
+    DSizes<StorageIndex, NumDims> block_count;
+    for (Index i = 0; i < block_count.rank(); ++i) {
       block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
     }
     m_total_block_count = array_prod(block_count);
@@ -773,15 +664,15 @@ class TensorBlockMapper {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  GetBlockForIndex(Index block_index, Scalar* data) const {
-    Index first_coeff_index = 0;
-    DSizes<Index, NumDims> coords;
-    DSizes<Index, NumDims> sizes;
-    DSizes<Index, NumDims> strides;
+  GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+    StorageIndex first_coeff_index = 0;
+    DSizes<StorageIndex, NumDims> coords;
+    DSizes<StorageIndex, NumDims> sizes;
+    DSizes<StorageIndex, NumDims> strides;
     if (NumDims > 0) {
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         for (int i = NumDims - 1; i > 0; --i) {
-          const Index idx = block_index / m_block_strides[i];
+          const StorageIndex idx = block_index / m_block_strides[i];
           coords[i] = idx * m_block_dim_sizes[i];
           sizes[i] =
               numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -799,7 +690,7 @@ class TensorBlockMapper {
         }
       } else {
         for (int i = 0; i < NumDims - 1; ++i) {
-          const Index idx = block_index / m_block_strides[i];
+          const StorageIndex idx = block_index / m_block_strides[i];
           coords[i] = idx * m_block_dim_sizes[i];
           sizes[i] =
               numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -824,19 +715,20 @@ class TensorBlockMapper {
                        data);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
     return m_total_block_count;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex
+  block_dims_total_size() const {
     return m_block_dim_sizes.TotalSize();
   }
 
  private:
   static Dimensions BlockDimensions(const Dimensions& tensor_dims,
                                     const TensorBlockShapeType block_shape,
-                                    size_t min_target_size) {
-    min_target_size = numext::maxi<size_t>(1, min_target_size);
+                                    Index min_target_size) {
+    min_target_size = numext::maxi<Index>(1, min_target_size);
 
     // If tensor fully fits into the target size, we'll treat it a single block.
     Dimensions block_dim_sizes = tensor_dims;
@@ -865,14 +757,14 @@ class TensorBlockMapper {
             dim_size_target, static_cast<size_t>(tensor_dims[i]));
       }
       // Add any un-allocated coefficients to inner dimension(s).
-      Index total_size = block_dim_sizes.TotalSize();
+      StorageIndex total_size = block_dim_sizes.TotalSize();
       for (int i = 0; i < NumDims; ++i) {
         const int dim = cond<Layout>()(i, NumDims - i - 1);
         if (block_dim_sizes[dim] < tensor_dims[dim]) {
-          const Index total_size_other_dims =
+          const StorageIndex total_size_other_dims =
               total_size / block_dim_sizes[dim];
-          const Index alloc_avail =
-              divup<Index>(min_target_size, total_size_other_dims);
+          const StorageIndex alloc_avail =
+              divup<StorageIndex>(min_target_size, total_size_other_dims);
           if (alloc_avail == block_dim_sizes[dim]) {
             // Insufficient excess coefficients to allocate.
             break;
@@ -882,14 +774,14 @@ class TensorBlockMapper {
         }
       }
     } else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
-      Index coeff_to_allocate = min_target_size;
+      StorageIndex coeff_to_allocate = min_target_size;
       for (int i = 0; i < NumDims; ++i) {
         const int dim = cond<Layout>()(i, NumDims - i - 1);
         block_dim_sizes[dim] =
             numext::mini(coeff_to_allocate, tensor_dims[dim]);
-        coeff_to_allocate =
-            divup(coeff_to_allocate,
-                  numext::maxi(static_cast<Index>(1), block_dim_sizes[dim]));
+        coeff_to_allocate = divup(
+            coeff_to_allocate,
+            numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
       }
       eigen_assert(coeff_to_allocate == 1);
     } else {
@@ -908,7 +800,7 @@ class TensorBlockMapper {
   Dimensions m_block_dim_sizes;
   Dimensions m_block_strides;
   Dimensions m_tensor_strides;
-  Index m_total_block_count;
+  StorageIndex m_total_block_count;
 };
 
 /**
@@ -923,12 +815,12 @@ class TensorBlockMapper {
 * processed together.
 *
 */
-template <typename Scalar, typename Index, int NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
 class TensorSliceBlockMapper {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
      TensorBlock;
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
   TensorSliceBlockMapper(const Dimensions& tensor_dims,
                          const Dimensions& tensor_slice_offsets,
@@ -942,7 +834,7 @@ class TensorSliceBlockMapper {
         m_block_stride_order(block_stride_order),
         m_total_block_count(1) {
     // Calculate block counts by dimension and total block count.
-    DSizes<Index, NumDims> block_count;
+    DSizes<StorageIndex, NumDims> block_count;
     for (size_t i = 0; i < block_count.rank(); ++i) {
       block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
     }
@@ -969,11 +861,11 @@ class TensorSliceBlockMapper {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  GetBlockForIndex(Index block_index, Scalar* data) const {
-    Index first_coeff_index = 0;
-    DSizes<Index, NumDims> coords;
-    DSizes<Index, NumDims> sizes;
-    DSizes<Index, NumDims> strides;
+  GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+    StorageIndex first_coeff_index = 0;
+    DSizes<StorageIndex, NumDims> coords;
+    DSizes<StorageIndex, NumDims> sizes;
+    DSizes<StorageIndex, NumDims> strides;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = block_index / m_block_strides[i];
@@ -991,16 +883,16 @@ class TensorSliceBlockMapper {
                              m_block_dim_sizes[0]);
       first_coeff_index += coords[0] * m_tensor_strides[0];
 
-      Index prev_dim = m_block_stride_order[0];
+      StorageIndex prev_dim = m_block_stride_order[0];
       strides[prev_dim] = 1;
       for (int i = 1; i < NumDims; ++i) {
-        const Index curr_dim = m_block_stride_order[i];
+        const StorageIndex curr_dim = m_block_stride_order[i];
         strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
         prev_dim = curr_dim;
       }
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = block_index / m_block_strides[i];
+        const StorageIndex idx = block_index / m_block_strides[i];
         coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
         sizes[i] = numext::mini(
             m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
@@ -1016,10 +908,10 @@ class TensorSliceBlockMapper {
                              m_block_dim_sizes[NumDims - 1]);
       first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
 
-      Index prev_dim = m_block_stride_order[NumDims - 1];
+      StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
       strides[prev_dim] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        const Index curr_dim = m_block_stride_order[i];
+        const StorageIndex curr_dim = m_block_stride_order[i];
         strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
         prev_dim = curr_dim;
       }
@@ -1029,7 +921,7 @@ class TensorSliceBlockMapper {
                        data);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
     return m_total_block_count;
   }
 
@@ -1041,7 +933,7 @@ class TensorSliceBlockMapper {
   Dimensions m_block_dim_sizes;
   Dimensions m_block_stride_order;
   Dimensions m_block_strides;
-  Index m_total_block_count;
+  StorageIndex m_total_block_count;
 };
 
 }  // namespace internal
@@ -1,5 +1,4 @@
 // This file is part of Eigen, a lightweight C++ template library
-// This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
@@ -51,12 +51,10 @@ struct TensorEvaluator
       typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlock;
   typedef typename internal::TensorBlockReader<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
-      PacketAccess>
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlockReader;
   typedef typename internal::TensorBlockWriter<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
-      PacketAccess>
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlockWriter;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@@ -204,8 +202,7 @@ struct TensorEvaluator<const Derived, Device>
       typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlock;
   typedef typename internal::TensorBlockReader<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
-      PacketAccess>
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlockReader;
 
   // Used for accessor extraction in SYCL Managed TensorMap:
@@ -36,15 +36,16 @@ template <typename Expression, typename Device, bool Vectorizable,
           bool Tileable>
 class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -56,35 +57,36 @@ class TensorExecutor {
 * Process all the data with a single cpu thread, using vectorized instructions.
 */
 template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+                     /*Tileable*/ false> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static inline void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<
           Expression, DefaultDevice>::PacketReturnType>::size;
 
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
@@ -97,42 +99,41 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable
 * sizing a block to fit L1 cache we get better cache performance.
 */
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> {
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
 
+  using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const DefaultDevice& device = DefaultDevice()) {
-    using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
-
-    using Index = typename traits<Expression>::Index;
-    const int NumDims = traits<Expression>::NumDimensions;
-
-    using Scalar = typename traits<Expression>::Scalar;
-    using ScalarNoConst = typename remove_const<Scalar>::type;
-
     using TensorBlock =
-        TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
-    using TensorBlockMapper =
-        TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+                                                NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    Index total_size = array_prod(evaluator.dimensions());
+    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
 
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
       internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               false>::run(expr, device);
+                               /*Tileable*/ false>::run(expr, device);
       return;
     }
 
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
       // Size tensor blocks to fit in cache (or requested target block size).
-      size_t block_total_size = numext::mini(cache_size, total_size);
+      Index block_total_size = numext::mini(cache_size, total_size);
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
       // Query expression tree for desired block size/shape.
       std::vector<TensorOpResourceRequirements> resources;
|
|||||||
Scalar* data = static_cast<Scalar*>(
|
Scalar* data = static_cast<Scalar*>(
|
||||||
device.allocate(block_total_size * sizeof(Scalar)));
|
device.allocate(block_total_size * sizeof(Scalar)));
|
||||||
|
|
||||||
const Index total_block_count = block_mapper.total_block_count();
|
const StorageIndex total_block_count = block_mapper.total_block_count();
|
||||||
for (Index i = 0; i < total_block_count; ++i) {
|
for (StorageIndex i = 0; i < total_block_count; ++i) {
|
||||||
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
|
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
|
||||||
evaluator.evalBlock(&block);
|
evaluator.evalBlock(&block);
|
||||||
}
|
}
|
||||||
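The tiled DefaultDevice path above sizes a block so it fits in L1 and then walks every block produced by a block mapper. A minimal standalone sketch of that control flow, not part of the commit, with an invented cache size (kCacheElems) and a plain std::vector standing in for Eigen's block mapper and evaluator:

// Standalone illustration only; not part of the commit.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const long total_size = 10000;          // coefficients in the "tensor"
  const long kCacheElems = 8 * 1024;      // pretend L1 capacity in elements
  // Fall back to the untiled path for tensors that already fit in cache.
  if (total_size < kCacheElems) {
    std::printf("small tensor: evaluate without tiling\n");
    return 0;
  }
  const long block_total_size = std::min(kCacheElems, total_size);
  const long block_count = (total_size + block_total_size - 1) / block_total_size;
  std::vector<float> scratch(block_total_size);  // one reusable block buffer
  for (long b = 0; b < block_count; ++b) {
    const long first = b * block_total_size;
    const long last = std::min(first + block_total_size, total_size);
    // "evalBlock" stand-in: touch every coefficient of this block.
    for (long i = first; i < last; ++i) scratch[i - first] = float(i);
  }
  std::printf("evaluated %ld blocks of at most %ld elements\n",
              block_count, block_total_size);
}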
@@ -162,37 +163,38 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
* executed on a single core.
*/
#ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
-static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+static void run(Evaluator* evaluator_in, const StorageIndex first,
+const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
-for (Index i = first; i < last; ++i) {
+for (StorageIndex i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
}

-static Index alignBlockSize(Index size) {
+static StorageIndex alignBlockSize(StorageIndex size) { return size; }
-return size;
-}
};

-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
-struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
-static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+static const int PacketSize =
+unpacket_traits<typename Evaluator::PacketReturnType>::size;

-static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+static void run(Evaluator* evaluator_in, const StorageIndex first,
+const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
-Index i = first;
+StorageIndex i = first;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
-Index last_chunk_offset = last - 4 * PacketSize;
+StorageIndex last_chunk_offset = last - 4 * PacketSize;
// Give compiler a strong possibility to unroll the loop. But don't insist
// on unrolling, because if the function is expensive compiler should not
// unroll the loop at the expense of inlining.
-for (; i <= last_chunk_offset; i += 4*PacketSize) {
+for (; i <= last_chunk_offset; i += 4 * PacketSize) {
-for (Index j = 0; j < 4; j++) {
+for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
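The vectorized EvalRange walks its range in chunks of four packets before handing the remainder to the scalar path. A rough standalone sketch of the same loop structure, assuming a fixed PacketSize of 4 and using plain array writes in place of evalPacket/evalScalar:

// Standalone illustration only; not part of the commit.
#include <cassert>
#include <vector>

int main() {
  constexpr int PacketSize = 4;           // stand-in for unpacket_traits<...>::size
  const int first = 0, last = 103;
  std::vector<float> out(last, 0.0f);
  assert(first % PacketSize == 0);
  int i = first;
  if (last - first >= PacketSize) {
    const int last_chunk_offset = last - 4 * PacketSize;
    // Process four packets per iteration so the compiler may unroll.
    for (; i <= last_chunk_offset; i += 4 * PacketSize)
      for (int j = 0; j < 4; ++j)
        for (int p = 0; p < PacketSize; ++p)   // "evalPacket" stand-in
          out[i + j * PacketSize + p] = 1.0f;
    const int vectorized = ((last - first) / PacketSize) * PacketSize + first;
    for (; i < vectorized; i += PacketSize)    // leftover whole packets
      for (int p = 0; p < PacketSize; ++p) out[i + p] = 1.0f;
  }
  for (; i < last; ++i) out[i] = 1.0f;         // scalar tail
  for (float v : out) assert(v == 1.0f);
}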
@@ -206,7 +208,7 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
}
}

-static Index alignBlockSize(Index size) {
+static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
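alignBlockSize rounds a requested block size up to a multiple of 4 * PacketSize with a bit mask, which works because that quantity is a power of two. A small hedged check of the rounding arithmetic, with PacketSize = 8 chosen only as an example:

// Standalone illustration only; not part of the commit.
#include <cassert>

int main() {
  constexpr long PacketSize = 8;              // example packet width
  constexpr long chunk = 4 * PacketSize;      // 32, a power of two
  const long sizes[] = {32, 33, 100};
  const long expected[] = {32, 64, 128};
  for (int i = 0; i < 3; ++i) {
    const long rounded = (sizes[i] + chunk - 1) & ~(chunk - 1);  // same mask trick
    assert(rounded == expected[i]);
  }
  // The real code only applies this for sizes >= 16 * PacketSize; smaller
  // sizes are aligned at a finer granularity.
  return 0;
}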
@@ -219,24 +221,24 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public:
-typedef typename Expression::Index Index;
+using StorageIndex = typename Expression::Index;

static inline void run(const Expression& expr,
const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
-const Index PacketSize =
+const StorageIndex PacketSize =
Vectorizable
? unpacket_traits<typename Evaluator::PacketReturnType>::size
: 1;
-const Index size = array_prod(evaluator.dimensions());
+const StorageIndex size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
-[&evaluator](Index first, Index last) {
+[&evaluator](StorageIndex first, StorageIndex last) {
EvalRange::run(&evaluator, first, last);
});
}
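device.parallelFor splits [0, size) into sub-ranges and hands each one to EvalRange::run on a pool thread. A rough sketch of that splitting with std::thread standing in for the Eigen thread pool; the even chunking here is an assumption, not Eigen's cost-model-driven policy:

// Standalone illustration only; not part of the commit.
#include <algorithm>
#include <cassert>
#include <thread>
#include <vector>

int main() {
  const int size = 1000;
  std::vector<int> out(size, 0);
  const int num_threads = 4;
  const int chunk = (size + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t) {
    const int first = t * chunk;
    const int last = std::min(size, first + chunk);
    // "EvalRange::run" stand-in: each worker evaluates its own half-open range.
    workers.emplace_back([first, last, &out] {
      for (int i = first; i < last; ++i) out[i] = i;
    });
  }
  for (auto& w : workers) w.join();
  for (int i = 0; i < size; ++i) assert(out[i] == i);
}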
@@ -247,24 +249,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
public:
-typedef typename Expression::Index Index;
+using Scalar = typename traits<Expression>::Scalar;
+using ScalarNoConst = typename remove_const<Scalar>::type;

+using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+using StorageIndex = typename traits<Expression>::Index;

+static const int NumDims = traits<Expression>::NumDimensions;

static inline void run(const Expression& expr,
const ThreadPoolDevice& device) {
-typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+using TensorBlock =
-typedef typename internal::remove_const<
+TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
-typename traits<Expression>::Scalar>::type Scalar;
+using TensorBlockMapper =
-typedef typename traits<Expression>::Index Index;
+TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;

-static const int NumDims = traits<Expression>::NumDimensions;

-typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
-typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
-TensorBlockMapper;

Evaluator evaluator(expr, device);
-std::size_t total_size = array_prod(evaluator.dimensions());
+StorageIndex total_size = array_prod(evaluator.dimensions());
-std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
@@ -276,7 +278,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
-size_t block_total_size = 0;
+Index block_total_size = 0;
// Query expression tree for desired block size/shape.
std::vector<internal::TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
@@ -296,15 +298,16 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
device.parallelFor(
block_mapper.total_block_count(), cost * block_size,
-[=, &device, &evaluator, &block_mapper](Index first, Index last) {
+[=, &device, &evaluator, &block_mapper](StorageIndex first,
+StorageIndex last) {
// currentThreadId() returns -1 if called from a thread not in the
-// threadpool, such as the main thread dispatching Eigen
+// thread pool, such as the main thread dispatching Eigen
// expressions.
const int thread_idx = device.currentThreadId();
eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
Scalar* thread_buf = reinterpret_cast<Scalar*>(
static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
-for (Index i = first; i < last; ++i) {
+for (StorageIndex i = first; i < last; ++i) {
auto block = block_mapper.GetBlockForIndex(i, thread_buf);
evaluator.evalBlock(&block);
}
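The tiled thread-pool path allocates one contiguous buffer with num_threads + 1 slots and selects a slot by currentThreadId(), reserving slot 0 for callers outside the pool (thread id -1). A hedged sketch of that slot indexing with plain new/delete and invented sizes:

// Standalone illustration only; not part of the commit.
#include <cassert>
#include <cstring>

int main() {
  const int num_threads = 4;
  const long aligned_blocksize = 256 * sizeof(float);   // bytes per slot
  // One allocation, num_threads + 1 slots: slot 0 serves thread id -1.
  char* buf = new char[(num_threads + 1) * aligned_blocksize];
  for (int thread_idx = -1; thread_idx < num_threads; ++thread_idx) {
    assert(thread_idx >= -1 && thread_idx < num_threads);
    float* thread_buf = reinterpret_cast<float*>(
        buf + aligned_blocksize * (thread_idx + 1));
    std::memset(thread_buf, 0, aligned_blocksize);      // use the scratch slot
  }
  delete[] buf;
}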
@@ -324,51 +327,51 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public:
-typedef typename Expression::Index Index;
+typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
};


#if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
-void run(Evaluator& eval, Index first, Index last, Index step_size) {
+void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
-for (Index i = first; i < last; i += step_size) {
+for (StorageIndex i = first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};

-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static __device__ EIGEN_ALWAYS_INLINE
-void run(Evaluator& eval, Index first, Index last, Index step_size) {
+void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
-const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-const Index vectorized_size = (last / PacketSize) * PacketSize;
+const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
-const Index vectorized_step_size = step_size * PacketSize;
+const StorageIndex vectorized_step_size = step_size * PacketSize;

// Use the vector path
-for (Index i = first * PacketSize; i < vectorized_size;
+for (StorageIndex i = first * PacketSize; i < vectorized_size;
i += vectorized_step_size) {
eval.evalPacket(i);
}
-for (Index i = vectorized_size + first; i < last; i += step_size) {
+for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};

-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {

-const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
-const Index step_size = blockDim.x * gridDim.x;
+const StorageIndex step_size = blockDim.x * gridDim.x;

const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
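EigenMetaKernel gives each GPU thread a starting index of blockIdx.x * blockDim.x + threadIdx.x and then strides by blockDim.x * gridDim.x, so a fixed grid covers any tensor size. A CPU-side sketch of that index arithmetic, with made-up block and grid sizes and plain C++ in place of a CUDA kernel:

// Standalone illustration only; not part of the commit.
#include <cassert>
#include <vector>

int main() {
  const int size = 1000;
  const int gridDim = 4, blockDim = 32;           // pretend launch configuration
  std::vector<int> hits(size, 0);
  // Emulate every (block, thread) pair running the grid-stride loop.
  for (int block = 0; block < gridDim; ++block) {
    for (int thread = 0; thread < blockDim; ++thread) {
      const int first_index = block * blockDim + thread;
      const int step_size = blockDim * gridDim;
      for (int i = first_index; i < size; i += step_size) hits[i] += 1;
    }
  }
  for (int h : hits) assert(h == 1);              // each coefficient visited once
}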
@@ -382,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
-const Index size = array_prod(evaluator.dimensions());
+const StorageIndex size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

LAUNCH_GPU_KERNEL(
-(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
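The launch configuration clamps the block count between 1 and an occupancy-derived maximum, so even a zero-sized tensor launches a valid grid. A hedged sketch of that arithmetic with invented device numbers:

// Standalone illustration only; not part of the commit.
#include <algorithm>
#include <cassert>

int divup(int x, int y) { return (x + y - 1) / y; }

int main() {
  const int block_size = 1024;            // maxGpuThreadsPerBlock stand-in
  const int multiprocessors = 20;         // invented device properties
  const int threads_per_mp = 2048;
  const int max_blocks = multiprocessors * threads_per_mp / block_size;  // 40
  const int test_sizes[] = {0, 1, 5000, 1 << 20};
  for (int size : test_sizes) {
    const int num_blocks =
        std::max(std::min(max_blocks, divup(size, block_size)), 1);
    assert(num_blocks >= 1 && num_blocks <= max_blocks);
  }
  return 0;
}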
@@ -37,6 +37,31 @@ static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
return internal::random<int>(1, dims.TotalSize());
}

+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+array<Index, NumDims> dims;
+for (int i = 0; i < NumDims; ++i) {
+dims[i] = internal::random<int>(1, 20);
+}
+return DSizes<Index, NumDims>(dims);
+};
+
+/** Dummy data type to test TensorBlock copy ops. */
+struct Data {
+Data() : Data(0) {}
+explicit Data(int v) { value = v; }
+int value;
+};
+
+bool operator==(const Data& lhs, const Data& rhs) {
+return lhs.value == rhs.value;
+}
+
+std::ostream& operator<<(std::ostream& os, const Data& d) {
+os << "Data: value=" << d.value;
+return os;
+}

template <typename T>
static T* GenerateRandomData(const Index& size) {
T* data = new T[size];
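The test introduces a small Data wrapper plus a matching random-data helper so the block-copy machinery is exercised with a non-trivial element type. A standalone sketch of the same pattern, using std::mt19937 instead of Eigen's internal::random and a hypothetical Sample type:

// Standalone illustration only; not part of the commit.
#include <cassert>
#include <random>
#include <vector>

struct Sample {                       // stands in for the Data wrapper above
  int value = 0;
};
bool operator==(const Sample& a, const Sample& b) { return a.value == b.value; }

template <typename T>
std::vector<T> MakeRandom(int n, std::mt19937& rng) {
  std::uniform_int_distribution<int> dist(1, 100);
  std::vector<T> out(n);
  for (auto& v : out) v.value = dist(rng);   // fill the wrapped payload
  return out;
}

int main() {
  std::mt19937 rng(42);
  auto data = MakeRandom<Sample>(16, rng);
  auto copy = data;                          // a trivial "round trip"
  assert(copy == data);
}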
@@ -46,6 +71,23 @@ static T* GenerateRandomData(const Index& size) {
return data;
}

+template <>
+Data* GenerateRandomData(const Index& size) {
+Data* data = new Data[size];
+for (int i = 0; i < size; ++i) {
+data[i] = Data(internal::random<int>(1, 100));
+}
+return data;
+}
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+for (int i = 0; i < NumDims; ++i) {
+std::cout << dims[i] << "; ";
+}
+std::cout << std::endl;
+}

template <int Layout>
static void test_block_mapper_sanity()
{
@@ -96,7 +138,7 @@ static void test_block_mapper_sanity()
// index in the visited set. Verify that every coeff accessed only once.
template <typename T, int Layout, int NumDims>
static void UpdateCoeffSet(
-const internal::TensorBlock<T, Index, 4, Layout>& block,
+const internal::TensorBlock<T, Index, NumDims, Layout>& block,
Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
const DSizes<Index, NumDims> block_sizes = block.block_sizes();
const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
@@ -114,14 +156,13 @@ static void UpdateCoeffSet(
}
}

-template <int Layout>
+template <typename T, int NumDims, int Layout>
-static void test_block_mapper_maps_every_element()
+static void test_block_mapper_maps_every_element() {
-{
+using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
-using T = int;
+using TensorBlockMapper =
-using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
+internal::TensorBlockMapper<T, Index, NumDims, Layout>;
-using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 4, Layout>;

-DSizes<Index, 4> dims(5, 7, 11, 17);
+DSizes<Index, NumDims> dims = RandomDims<NumDims>();

// Keep track of elements indices available via block access.
std::set<Index> coeff_set;
@@ -131,29 +172,36 @@ static void test_block_mapper_maps_every_element()

for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
-UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
+UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
-choose(Layout, 3, 0), &coeff_set);
+choose(Layout, NumDims - 1, 0),
+&coeff_set);
}

// Verify that every coefficient in the original Tensor is accessible through
// TensorBlock only once.
-auto total_coeffs = static_cast<int>(dims.TotalSize());
+Index total_coeffs = dims.TotalSize();
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
-VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast<Index>(0));
+VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
-VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast<Index>(total_coeffs - 1));
+VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
}

-template <int Layout>
+template <typename T, int NumDims, int Layout>
-static void test_slice_block_mapper_maps_every_element()
+static void test_slice_block_mapper_maps_every_element() {
-{
+using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
-using T = int;
-using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
using TensorSliceBlockMapper =
-internal::TensorSliceBlockMapper<T, Index, 4, Layout>;
+internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>;

-DSizes<Index, 4> tensor_dims(5,7,11,17);
+DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
-DSizes<Index, 4> tensor_slice_offsets(1,3,5,7);
+DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
-DSizes<Index, 4> tensor_slice_extents(3,2,4,5);
+DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();

+// Make sure that tensor offsets + extents do not overflow.
+for (int i = 0; i < NumDims; ++i) {
+tensor_slice_offsets[i] =
+numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
+tensor_slice_extents[i] = numext::mini(
+tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
+}

// Keep track of elements indices available via block access.
std::set<Index> coeff_set;
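When both the slice offsets and extents are drawn at random, the test clamps them so that offset plus extent never runs past the tensor dimension. A small hedged check of that clamping rule on made-up numbers:

// Standalone illustration only; not part of the commit.
#include <algorithm>
#include <cassert>

int main() {
  const int dims[3] = {5, 7, 11};
  int offsets[3] = {9, 3, 11};     // deliberately out of range
  int extents[3] = {4, 9, 2};
  for (int i = 0; i < 3; ++i) {
    offsets[i] = std::min(dims[i] - 1, offsets[i]);
    extents[i] = std::min(extents[i], dims[i] - offsets[i]);
    assert(offsets[i] + extents[i] <= dims[i]);   // slice stays inside the tensor
  }
  return 0;
}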
@@ -161,61 +209,59 @@ static void test_slice_block_mapper_maps_every_element()
auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());

// Pick a random dimension sizes for the tensor blocks.
-DSizes<Index, 4> block_sizes;
+DSizes<Index, NumDims> block_sizes;
-for (int i = 0; i < 4; ++i) {
+for (int i = 0; i < NumDims; ++i) {
block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]);
}

TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
tensor_slice_extents, block_sizes,
-DimensionList<Index, 4>());
+DimensionList<Index, NumDims>());

for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
-UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
+UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
-choose(Layout, 3, 0), &coeff_set);
+choose(Layout, NumDims - 1, 0),
+&coeff_set);
}

VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
}

-template <int Layout>
+template <typename T, int NumDims, int Layout>
-static void test_block_io_copy_data_from_source_to_target()
+static void test_block_io_copy_data_from_source_to_target() {
-{
+typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
-using T = float;
+typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
+TensorBlockMapper;

-typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
+typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
-typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;

-typedef internal::TensorBlockReader<T, Index, 5, Layout, true>
TensorBlockReader;
-typedef internal::TensorBlockWriter<T, Index, 5, Layout, true>
+typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;

-typedef std::vector<T, aligned_allocator<T>> DataVector;
+DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();

-DSizes<Index, 5> input_tensor_dims(5, 7, 11, 17, 3);
const auto input_tensor_size = input_tensor_dims.TotalSize();
-DataVector input_data(input_tensor_size, 0);
-for (int i = 0; i < input_tensor_size; ++i) {
-input_data[i] = internal::random<T>();
-}

-DataVector output_data(input_tensor_size, 0);
+T* input_data = GenerateRandomData<T>(input_tensor_size);
+T* output_data = new T[input_tensor_size];

TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));
+T* block_data = new T[block_mapper.block_dims_total_size()];

-DataVector block_data(block_mapper.block_dims_total_size(), 0);
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
-TensorBlock block = block_mapper.GetBlockForIndex(i, block_data.data());
+TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
-TensorBlockReader::Run(&block, input_data.data());
+TensorBlockReader::Run(&block, input_data);
-TensorBlockWriter::Run(block, output_data.data());
+TensorBlockWriter::Run(block, output_data);
}

for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}

+delete[] input_data;
+delete[] output_data;
+delete[] block_data;
}

template <int Layout, int NumDims>
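test_block_io_copy_data_from_source_to_target reads every block out of a source buffer, writes it back into a destination buffer, and then checks the two agree element by element. A standalone sketch of that round-trip pattern with fixed-size chunks in place of tensor blocks:

// Standalone illustration only; not part of the commit.
#include <algorithm>
#include <cassert>
#include <random>
#include <vector>

int main() {
  std::mt19937 rng(1);
  std::uniform_real_distribution<float> dist(0.f, 1.f);
  std::vector<float> input(1000), output(1000, 0.f);
  for (auto& v : input) v = dist(rng);

  const int block_size = 64;                 // stand-in for block_dims_total_size()
  std::vector<float> block(block_size);
  for (std::size_t first = 0; first < input.size(); first += block_size) {
    const std::size_t last = std::min(input.size(), first + block_size);
    // "reader": source tensor -> block buffer.
    std::copy(input.begin() + first, input.begin() + last, block.begin());
    // "writer": block buffer -> destination tensor.
    std::copy(block.begin(), block.begin() + (last - first), output.begin() + first);
  }
  assert(input == output);
}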
@@ -261,31 +307,32 @@ static array<Index, NumDims> ComputeStrides(
return strides;
}

-template <int Layout>
+template <typename T, int NumDims, int Layout>
static void test_block_io_copy_using_reordered_dimensions() {
-typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
+typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
-typedef internal::TensorBlockMapper<float, Index, 5, Layout>
+typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
TensorBlockMapper;

-typedef internal::TensorBlockReader<float, Index, 5, Layout, false>
+typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
-typedef internal::TensorBlockWriter<float, Index, 5, Layout, false>
+typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;

-DSizes<Index, 5> input_tensor_dims(5, 7, 11, 17, 3);
+DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const auto input_tensor_size = input_tensor_dims.TotalSize();

// Create a random input tensor.
-auto* input_data = GenerateRandomData<float>(input_tensor_size);
+T* input_data = GenerateRandomData<T>(input_tensor_size);

// Create a random dimension re-ordering/shuffle.
-std::vector<Index> shuffle = {0, 1, 2, 3, 4};
+std::vector<Index> shuffle;
+for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());

-DSizes<Index, 5> output_tensor_dims;
+DSizes<Index, NumDims> output_tensor_dims;
-array<Index, 5> input_to_output_dim_map;
+array<Index, NumDims> input_to_output_dim_map;
-array<Index, 5> output_to_input_dim_map;
+array<Index, NumDims> output_to_input_dim_map;
-for (Index i = 0; i < 5; ++i) {
+for (Index i = 0; i < NumDims; ++i) {
output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
input_to_output_dim_map[i] = shuffle[i];
output_to_input_dim_map[shuffle[i]] = i;
@@ -295,17 +342,17 @@ static void test_block_io_copy_using_reordered_dimensions() {
TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));

-auto* block_data = new float[block_mapper.block_dims_total_size()];
+auto* block_data = new T[block_mapper.block_dims_total_size()];
-auto* output_data = new float[input_tensor_size];
+auto* output_data = new T[input_tensor_size];

-array<Index, 5> input_tensor_strides =
+array<Index, NumDims> input_tensor_strides =
-ComputeStrides<Layout, 5>(input_tensor_dims);
+ComputeStrides<Layout, NumDims>(input_tensor_dims);
-array<Index, 5> output_tensor_strides =
+array<Index, NumDims> output_tensor_strides =
-ComputeStrides<Layout, 5>(output_tensor_dims);
+ComputeStrides<Layout, NumDims>(output_tensor_dims);

for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
-const Index first_coeff_index = GetInputIndex<Layout, 5>(
+const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
block.first_coeff_index(), output_to_input_dim_map,
input_tensor_strides, output_tensor_strides);
TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
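The reordered-dimensions test rests on two pieces of index arithmetic: layout-dependent strides in the style of ComputeStrides, and a shuffle map relating input and output dimensions. A hedged sketch of the column-major stride computation and of pushing dimensions through a shuffle:

// Standalone illustration only; not part of the commit.
#include <array>
#include <cassert>

int main() {
  constexpr int NumDims = 3;
  const std::array<int, NumDims> in_dims = {4, 5, 6};
  const std::array<int, NumDims> shuffle = {2, 0, 1};   // input dim i -> output dim shuffle[i]

  // Column-major strides: the first dimension is contiguous.
  std::array<int, NumDims> in_strides{};
  in_strides[0] = 1;
  for (int i = 1; i < NumDims; ++i) in_strides[i] = in_strides[i - 1] * in_dims[i - 1];

  std::array<int, NumDims> out_dims{};
  for (int i = 0; i < NumDims; ++i) out_dims[shuffle[i]] = in_dims[i];

  // A sample coordinate flattens to a single offset via the strides.
  const std::array<int, NumDims> in_coord = {3, 2, 1};
  int in_index = 0;
  for (int i = 0; i < NumDims; ++i) in_index += in_coord[i] * in_strides[i];
  assert(in_index == 3 + 2 * 4 + 1 * 20);   // 31
  assert(out_dims[2] == 4 && out_dims[0] == 5 && out_dims[1] == 6);
}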
@@ -327,18 +374,21 @@
static void test_block_io_zero_stride()
{
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
-typedef internal::TensorBlockReader<float, Index, 5, Layout, true>
+typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
-typedef internal::TensorBlockWriter<float, Index, 5, Layout, true>
+typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;

-DSizes<Index, 5> input_tensor_dims(1, 2, 1, 3, 1);
+DSizes<Index, 5> rnd_dims = RandomDims<5>();
-const auto input_tensor_size = input_tensor_dims.TotalSize();

-// Create a random input tensor.
+DSizes<Index, 5> input_tensor_dims = rnd_dims;
+input_tensor_dims[0] = 1;
+input_tensor_dims[2] = 1;
+input_tensor_dims[4] = 1;
+const auto input_tensor_size = input_tensor_dims.TotalSize();
auto* input_data = GenerateRandomData<float>(input_tensor_size);

-DSizes<Index, 5> output_tensor_dims(3, 2, 3, 3, 2);
+DSizes<Index, 5> output_tensor_dims = rnd_dims;

DSizes<Index, 5> input_tensor_strides(
ComputeStrides<Layout, 5>(input_tensor_dims));
@@ -401,9 +451,9 @@ static void test_block_io_zero_stride()
template <int Layout>
static void test_block_io_squeeze_ones() {
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
-typedef internal::TensorBlockReader<float, Index, 5, Layout, true>
+typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
-typedef internal::TensorBlockWriter<float, Index, 5, Layout, true>
+typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;

// Total size > 1.
@@ -467,23 +517,23 @@ static void test_block_io_squeeze_ones() {
}
}

-template <int Layout>
+template <typename T, int NumDims, int Layout>
static void test_block_cwise_binary_io_basic() {
-typedef internal::scalar_sum_op<float> BinaryFunctor;
+typedef internal::scalar_sum_op<T> BinaryFunctor;
-typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
Layout>
TensorBlockCwiseBinaryIO;

-DSizes<Index, 5> block_sizes(2, 3, 5, 7, 11);
+DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
-DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));

const auto total_size = block_sizes.TotalSize();

// Create a random input tensors.
-auto* left_data = GenerateRandomData<float>(total_size);
+T* left_data = GenerateRandomData<T>(total_size);
-auto* right_data = GenerateRandomData<float>(total_size);
+T* right_data = GenerateRandomData<T>(total_size);

-auto* output_data = new float[total_size];
+T* output_data = new T[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
@@ -532,13 +582,22 @@ static void test_block_cwise_binary_io_zero_strides() {
Layout>
TensorBlockCwiseBinaryIO;

-DSizes<Index, 5> left_sizes(1, 3, 1, 7, 1);
+DSizes<Index, 5> rnd_dims = RandomDims<5>();

+DSizes<Index, 5> left_sizes = rnd_dims;
+left_sizes[0] = 1;
+left_sizes[2] = 1;
+left_sizes[4] = 1;

DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
left_strides[0] = 0;
left_strides[2] = 0;
left_strides[4] = 0;

-DSizes<Index, 5> right_sizes(2, 1, 5, 1, 11);
+DSizes<Index, 5> right_sizes = rnd_dims;
+right_sizes[1] = 0;
+right_sizes[3] = 0;

DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
right_strides[1] = 0;
right_strides[3] = 0;
@@ -547,7 +606,7 @@ static void test_block_cwise_binary_io_zero_strides() {
auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());

-DSizes<Index, 5> output_sizes(2, 3, 5, 7, 11);
+DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));

const auto output_total_size = output_sizes.TotalSize();
@@ -557,11 +616,11 @@ static void test_block_cwise_binary_io_zero_strides() {
TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
output_data, left_strides, left_data,
right_strides, right_data);
-for (int i = 0; i < 2; ++i) {
+for (int i = 0; i < rnd_dims[0]; ++i) {
-for (int j = 0; j < 3; ++j) {
+for (int j = 0; j < rnd_dims[1]; ++j) {
-for (int k = 0; k < 5; ++k) {
+for (int k = 0; k < rnd_dims[2]; ++k) {
-for (int l = 0; l < 7; ++l) {
+for (int l = 0; l < rnd_dims[3]; ++l) {
-for (int m = 0; m < 11; ++m) {
+for (int m = 0; m < rnd_dims[4]; ++m) {
Index output_index = i * output_strides[0] + j * output_strides[1] +
k * output_strides[2] + l * output_strides[3] +
m * output_strides[4];
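Setting selected strides to zero is how the cwise-binary test broadcasts a size-1 dimension: the same source element is re-read for every output coordinate along that dimension. A compact sketch of zero-stride indexing in two dimensions:

// Standalone illustration only; not part of the commit.
#include <cassert>

int main() {
  // left is logically 1 x 4 and gets broadcast over 3 rows.
  const float left[4] = {10.f, 20.f, 30.f, 40.f};
  const int left_row_stride = 0, left_col_stride = 1;   // zero stride broadcasts rows
  float out[3][4];
  for (int r = 0; r < 3; ++r)
    for (int c = 0; c < 4; ++c)
      out[r][c] = left[r * left_row_stride + c * left_col_stride] + 1.f;
  for (int r = 0; r < 3; ++r)
    for (int c = 0; c < 4; ++c)
      assert(out[r][c] == left[c] + 1.f);
}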
@@ -893,31 +952,44 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
}
}

-#define CALL_SUBTEST_LAYOUTS(NAME) \
+#define TEST_LAYOUTS(NAME) \
CALL_SUBTEST(NAME<ColMajor>()); \
CALL_SUBTEST(NAME<RowMajor>())

-#define CALL_SUBTEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \
+CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
CALL_SUBTEST(NAME<ColMajor>(ARG)); \
CALL_SUBTEST(NAME<RowMajor>(ARG))

EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
-CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity);
+TEST_LAYOUTS(test_block_mapper_sanity);
-CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element);
+TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
-CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element);
+TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
-CALL_SUBTEST_LAYOUTS(test_block_io_copy_data_from_source_to_target);
+TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
-CALL_SUBTEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions);
+TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
-CALL_SUBTEST_LAYOUTS(test_block_io_zero_stride);
+TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
-CALL_SUBTEST_LAYOUTS(test_block_io_squeeze_ones);
+TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
-CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_basic);
+TEST_LAYOUTS(test_block_io_zero_stride);
-CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
+TEST_LAYOUTS(test_block_io_squeeze_ones);
-CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
+TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
-CALL_SUBTEST_LAYOUTS(test_uniform_block_shape);
+TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
-CALL_SUBTEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
+TEST_LAYOUTS(test_uniform_block_shape);
-CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
-CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
}

-#undef CALL_SUBTEST_LAYOUTS
+#undef TEST_LAYOUTS
-#undef CALL_SUBTEST_LAYOUTS_WITH_ARG
+#undef TEST_LAYOUTS_WITH_ARG
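TEST_LAYOUTS_AND_DIMS fans one templated test out over both layouts and ranks 1 through 5. A trimmed-down sketch of that macro pattern, with a plain counter standing in for Eigen's CALL_SUBTEST machinery and only ranks 1 to 3 instantiated:

// Standalone illustration only; not part of the commit.
#include <cassert>

enum Layout { ColMajor, RowMajor };
static int runs = 0;

template <typename T, int NumDims, int L>
void dummy_test() { runs += NumDims; }   // stand-in for a real block-access test

#define RUN_LAYOUTS_AND_DIMS(TYPE, NAME) \
  NAME<TYPE, 1, ColMajor>(); NAME<TYPE, 1, RowMajor>(); \
  NAME<TYPE, 2, ColMajor>(); NAME<TYPE, 2, RowMajor>(); \
  NAME<TYPE, 3, ColMajor>(); NAME<TYPE, 3, RowMajor>()

int main() {
  RUN_LAYOUTS_AND_DIMS(float, dummy_test);
  assert(runs == 2 * (1 + 2 + 3));   // every layout/rank combination ran once
}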
@@ -13,7 +13,6 @@

#include <Eigen/CXX11/Tensor>

-using Eigen::Index;
using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
@@ -25,9 +24,16 @@ template <typename Device, bool Vectorizable, bool Tileable, int Layout>
static void test_execute_binary_expr(Device d) {
// Pick a large enough tensor size to bypass small tensor block evaluation
// optimization.
-Tensor<float, 3> lhs(840, 390, 37);
+int d0 = internal::random<int>(100, 200);
-Tensor<float, 3> rhs(840, 390, 37);
+int d1 = internal::random<int>(100, 200);
-Tensor<float, 3> dst(840, 390, 37);
+int d2 = internal::random<int>(100, 200);

+static constexpr int Options = 0;
+using IndexType = int;

+Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
+Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
+Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);

lhs.setRandom();
rhs.setRandom();
@@ -40,9 +46,9 @@ static void test_execute_binary_expr(Device d) {

Executor::run(Assign(dst, expr), d);

-for (int i = 0; i < 840; ++i) {
+for (int i = 0; i < d0; ++i) {
-for (int j = 0; j < 390; ++j) {
+for (int j = 0; j < d1; ++j) {
-for (int k = 0; k < 37; ++k) {
+for (int k = 0; k < d2; ++k) {
float sum = lhs(i, j, k) + rhs(i, j, k);
VERIFY_IS_EQUAL(sum, dst(i, j, k));
}