From d6568425f8b384ef36da1f16be76a3977fb90846 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 31 Jul 2018 08:13:43 +0000 Subject: [PATCH 01/10] Close branch tiling_3. From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: [PATCH 02/10] Add block evaluation support to TensorOps --- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 14 +- .../CXX11/src/Tensor/TensorBroadcasting.h | 333 +++++++++++- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 144 +++++- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 218 +++++++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 333 ++++++++++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 +++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 237 ++++++++- unsupported/test/cxx11_tensor_executor.cpp | 480 ++++++++++++++++-- 8 files changed, 2047 insertions(+), 149 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 84cf6d216..33c4ef5b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -214,7 +214,7 @@ class TensorBlockIO { num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); const StorageIndex block_dim_for_tensor_stride1_dim = NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim]; - size_t block_inner_dim_size = + Index block_inner_dim_size = NumDims == 0 ? 1 : block.block_sizes()[block_dim_for_tensor_stride1_dim]; for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { @@ -745,16 +745,15 @@ class TensorBlockMapper { if (block_shape == TensorBlockShapeType::kUniformAllDims) { // Tensor will not fit within 'min_target_size' budget: calculate tensor // block dimension sizes based on "square" dimension size target. - const size_t dim_size_target = static_cast( + const Index dim_size_target = static_cast( std::pow(static_cast(min_target_size), 1.0 / static_cast(block_dim_sizes.rank()))); - for (size_t i = 0; i < block_dim_sizes.rank(); ++i) { + for (Index i = 0; i < block_dim_sizes.rank(); ++i) { // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it // a multiple of the packet size. Note that reducing // 'block_dim_size' in this manner can increase the number of // blocks, and so will amplify any per-block overhead. - block_dim_sizes[i] = numext::mini( - dim_size_target, static_cast(tensor_dims[i])); + block_dim_sizes[i] = numext::mini(dim_size_target, tensor_dims[i]); } // Add any un-allocated coefficients to inner dimension(s). 
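A sketch of the kUniformAllDims heuristic changed in this hunk: aim for roughly equal ("square") block extents whose product is about min_target_size, clamp by the real tensor extents, then grow inner dimensions with whatever budget is left over. Plain std::vector stands in for Eigen's DSizes, and the helper name is illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Choose per-dimension block sizes: a "square" target per dimension,
// clamped by the tensor extents, then leftover budget goes to inner dims.
std::vector<int64_t> UniformBlockSizes(const std::vector<int64_t>& dims,
                                       int64_t min_target_size) {
  const int rank = static_cast<int>(dims.size());
  if (rank == 0) return {};
  const int64_t target = static_cast<int64_t>(std::pow(
      static_cast<double>(min_target_size), 1.0 / static_cast<double>(rank)));
  std::vector<int64_t> block(rank);
  for (int i = 0; i < rank; ++i) {
    block[i] = std::min(std::max<int64_t>(target, 1), dims[i]);
  }
  // Hand any un-allocated budget to the inner (fastest-varying) dims first.
  for (int i = 0; i < rank; ++i) {
    int64_t total = 1;
    for (int64_t b : block) total *= b;
    if (total >= min_target_size) break;
    const int64_t budget = min_target_size / (total / block[i]);
    block[i] = std::min(dims[i], std::max(block[i], budget));
  }
  return block;
}

With this sketch, a 64x64x64 tensor and a 2048-coefficient budget first get 12x12x12 blocks, and the leftover pass then widens only the innermost dimension.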
StorageIndex total_size = block_dim_sizes.TotalSize(); @@ -789,9 +788,8 @@ class TensorBlockMapper { } } - eigen_assert( - block_dim_sizes.TotalSize() >= - numext::mini(min_target_size, tensor_dims.TotalSize())); + eigen_assert(block_dim_sizes.TotalSize() >= + numext::mini(min_target_size, tensor_dims.TotalSize())); return block_dim_sizes; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 343ab6269..a851e7f55 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,16 +108,29 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, + IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) - { + using ScalarNoConst = typename internal::remove_const::type; + + // Block based access to the XprType (input) tensor. + using TensorBlock = internal::TensorBlock; + using TensorBlockReader = internal::TensorBlockReader; + // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + using BroadcastDimensions = DSizes; + using BroadcastTensorBlock = internal::TensorBlock; + using BroadcastTensorBlockReader = internal::TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_broadcast(op.broadcast()), + m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. @@ -216,8 +229,7 @@ struct TensorEvaluator, Device> } // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; @@ -243,11 +255,15 @@ struct TensorEvaluator, Device> inputIndex += (index % m_impl.dimensions()[0]); } } - return m_impl.coeff(inputIndex); + return inputIndex; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; @@ -263,17 +279,22 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); inputIndex += index; } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); } } - return m_impl.coeff(inputIndex); + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + return m_impl.coeff(indexRowMajor(index)); } template @@ -553,13 +574,291 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); + Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + block_total_size_max)); + + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = m_impl.coeff(0); + return; + } + + // Because we only support kSkewedInnerDims blocking, block size should be + // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size + // for the first outer dim, and 1 for other outer dims. This is guaranteed + // by MergeResourceRequirements() in TensorBlock.h. + const auto& output_block_sizes = output_block->block_sizes(); + const auto& output_block_strides = output_block->block_strides(); + + // Find where outer dims start. + int outer_dim_start = 0; + Index outer_dim_size = 1, inner_dim_size = 1; + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? 
i + : NumDims - i - 1; + if (i > outer_dim_start) { + eigen_assert(output_block_sizes[dim] == 1); + } else if (output_block_sizes[dim] != m_dimensions[dim]) { + eigen_assert(output_block_sizes[dim] < m_dimensions[dim]); + outer_dim_size = output_block_sizes[dim]; + } else { + inner_dim_size *= output_block_sizes[dim]; + ++outer_dim_start; + } + } + + if (inner_dim_size == 0 || outer_dim_size == 0) { + return; + } + + const auto& input_dims = m_impl.dimensions(); + + // Pre-fill input_block_sizes, broadcast_block_sizes, + // broadcast_block_strides, and broadcast_tensor_strides. Later on we will + // only modify the outer_dim_start-th dimension on these arrays. + + // Calculate the input block size for looking into the input. + Dimensions input_block_sizes; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < outer_dim_start; ++i) { + input_block_sizes[i] = input_dims[i]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + input_block_sizes[i] = 1; + } + } else { + for (int i = 0; i < outer_dim_start; ++i) { + input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + input_block_sizes[NumDims - i - 1] = 1; + } + } + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...]. + // + // - broadcast_block_strides is [output_block_strides[0], + // output_block_strides[0] * d_0, + // output_block_strides[1], + // output_block_strides[1] * d_1, + // ...]. + // + // - broadcast_tensor_strides is [output_block_strides[0], + // 0, + // output_block_strides[1], + // 0, + // ...]. + BroadcastDimensions broadcast_block_sizes, broadcast_block_strides, + broadcast_tensor_strides; + + for (int i = 0; i < outer_dim_start; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumDims - i - 1; + const int copy_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 2 * i + : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = + static_cast(Layout) == static_cast(ColMajor) ? copy_dim + 1 + : copy_dim - 1; + broadcast_block_sizes[copy_dim] = input_dims[dim]; + broadcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + broadcast_block_strides[copy_dim] = output_block_strides[dim]; + broadcast_block_strides[broadcast_dim] = + output_block_strides[dim] * input_dims[dim]; + broadcast_tensor_strides[copy_dim] = m_inputStrides[dim]; + broadcast_tensor_strides[broadcast_dim] = 0; + } + for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : 2 * NumDims - i - 1; + broadcast_block_sizes[dim] = 1; + broadcast_block_strides[dim] = 0; + broadcast_tensor_strides[dim] = 0; + } + + const int outer_dim = static_cast(Layout) == static_cast(ColMajor) + ? outer_dim_start + : NumDims - outer_dim_start - 1; + + if (outer_dim_size == 1) { + // We just need one block read using the ready-set values above. + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } else if (input_dims[outer_dim] == 1) { + // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size. + const int broadcast_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } else { + // The general case. Let's denote the output block as x[..., + // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on + // the outer_dim_start-th dimension (< NumDims). We need to split the + // a:a+outer_dim_size into possibly 3 sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[outer_dim_start] in [a, a+outer_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start] + // in [a, a+outer_dim_size]. + // + // (3) c:a+outer_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index outer_dim_left_index = + output_block->first_coeff_index() / m_outputStrides[outer_dim]; + + // Find b and c. + const Index input_outer_dim_size = input_dims[outer_dim]; + + // First multiple after a. This is b when <= outer_dim_left_index + + // outer_dim_size. + const Index first_multiple = + divup(outer_dim_left_index, input_outer_dim_size) * + input_outer_dim_size; + + if (first_multiple <= outer_dim_left_index + outer_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = (outer_dim_left_index + outer_dim_size) / + input_outer_dim_size * input_outer_dim_size; + const int copy_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + const int broadcast_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + if (first_multiple > outer_dim_left_index) { + const Index head_size = first_multiple - outer_dim_left_index; + input_block_sizes[outer_dim] = head_size; + broadcast_block_sizes[copy_outer_dim] = head_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = 1; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } + if (first_multiple < last_multiple) { + input_block_sizes[outer_dim] = input_outer_dim_size; + broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = + (last_multiple - first_multiple) / input_outer_dim_size; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (first_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, + offset, output_block); + } + if (last_multiple < outer_dim_left_index + outer_dim_size) { + const Index tail_size = + outer_dim_left_index + outer_dim_size - last_multiple; + input_block_sizes[outer_dim] = tail_size; + broadcast_block_sizes[copy_outer_dim] = tail_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = 1; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (last_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, + offset, output_block); + } + } else { + // b and c do not exist. + const int copy_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + input_block_sizes[outer_dim] = outer_dim_size; + broadcast_block_sizes[copy_outer_dim] = outer_dim_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } + } + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock( + const Dimensions& input_block_sizes, + const BroadcastDimensions& broadcast_block_sizes, + const BroadcastDimensions& broadcast_block_strides, + const BroadcastDimensions& broadcast_tensor_strides, Index offset, + TensorBlock* output_block) const { + TensorBlock input_view_block( + static_cast(Layout) == static_cast(ColMajor) + ? indexColMajor(output_block->first_coeff_index() + offset) + : indexRowMajor(output_block->first_coeff_index() + offset), + input_block_sizes, Dimensions(m_inputStrides), + Dimensions(m_inputStrides), NULL); + + internal::TensorBlockView input_block(m_device, m_impl, + input_view_block); + BroadcastTensorBlock broadcast_block( + 0, broadcast_block_sizes, broadcast_block_strides, + broadcast_tensor_strides, output_block->data() + offset); + + BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); + } + protected: + const Device& m_device; const Broadcast m_broadcast; Dimensions m_dimensions; array m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 085c05f3d..a0d039e64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,14 +144,19 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. 
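The broadcasting block() above relies on the "2x tensor rank, zero input stride" trick described in its comments: each input dimension contributes a copy dimension plus a broadcast dimension whose source stride is 0. A self-contained illustration of why that works, using plain std::vector descriptors and a generic strided copy instead of Eigen's TensorBlockReader; StridedCopy and the concrete sizes below are illustrative, not part of the patch.

#include <cstdio>
#include <vector>

// Generic strided copy: walks an N-dimensional index space and copies
// src[sum(idx[i] * src_stride[i])] into dst[sum(idx[i] * dst_stride[i])].
// Broadcasting falls out of giving a dimension a zero source stride.
void StridedCopy(const std::vector<int>& sizes,
                 const std::vector<int>& dst_strides,
                 const std::vector<int>& src_strides,
                 const float* src, float* dst) {
  const int rank = static_cast<int>(sizes.size());
  std::vector<int> idx(rank, 0);
  while (true) {
    int s = 0, d = 0;
    for (int i = 0; i < rank; ++i) {
      s += idx[i] * src_strides[i];
      d += idx[i] * dst_strides[i];
    }
    dst[d] = src[s];
    int i = 0;
    for (; i < rank; ++i) {
      if (++idx[i] < sizes[i]) break;  // column-major order: dim 0 fastest
      idx[i] = 0;
    }
    if (i == rank) return;
  }
}

int main() {
  // Broadcast a length-3 input 4 times: express the length-12 output as a
  // rank-2 iteration space [copy_dim = 3, broadcast_dim = 4] where the
  // broadcast dimension has source stride 0.
  const float in[3] = {1.f, 2.f, 3.f};
  float out[12];
  StridedCopy({3, 4}, {1, 3}, {1, 0}, in, out);
  for (float v : out) std::printf("%g ", v);  // 1 2 3 1 2 3 1 2 3 1 2 3
  std::printf("\n");
}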
- IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { @@ -184,6 +189,23 @@ struct TensorEvaluator, Device> } m_inputStride *= input_dims[m_dim.actualDim()]; m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -266,6 +288,60 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block->block_sizes(); + const DSizes& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. 
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, input_block_strides, + m_inputStrides, output_block->data()); + m_impl.block(&input_block); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || @@ -316,6 +392,8 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; + Index m_block_total_size_max; + DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device& m_device; @@ -342,12 +420,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -395,6 +479,50 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block.block_sizes(); + const DSizes& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Write input block. 
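Both block() and writeBlock() above build the input block with the same size/stride mapping: keep the output block's dimensions on either side of the chipped dimension, insert a size-1 dimension at chip_dim, and give it a stride consistent with its column-major neighbour. A reduced sketch with std::vector in place of DSizes; BlockDesc and ChipToInputBlock are illustrative names.

#include <vector>

struct BlockDesc {
  std::vector<long> sizes;
  std::vector<long> strides;
};

// Map an (N-1)-dimensional chip block onto the N-dimensional input tensor,
// assuming column-major layout: insert a size-1 dimension at chip_dim and
// give it a stride consistent with the block's own (dense) strides.
BlockDesc ChipToInputBlock(const BlockDesc& output_block, int chip_dim) {
  const int input_rank = static_cast<int>(output_block.sizes.size()) + 1;
  BlockDesc input{std::vector<long>(input_rank), std::vector<long>(input_rank)};
  for (int i = 0; i < input_rank; ++i) {
    if (i < chip_dim) {
      input.sizes[i] = output_block.sizes[i];
      input.strides[i] = output_block.strides[i];
    } else if (i > chip_dim) {
      input.sizes[i] = output_block.sizes[i - 1];
      input.strides[i] = output_block.strides[i - 1];
    } else {
      input.sizes[i] = 1;
    }
  }
  // The inserted dimension has extent 1, so any consistent stride works;
  // derive it from the previous dimension as the patch does.
  input.strides[chip_dim] =
      chip_dim == 0 ? 1
                    : input.strides[chip_dim - 1] * input.sizes[chip_dim - 1];
  return input;
}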
+ this->m_impl.writeBlock(InputTensorBlock( + this->srcCoeff(output_block.first_coeff_index()), input_block_sizes, + input_block_strides, this->m_inputStrides, + const_cast(output_block.data()))); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72cb2d15f..4987b898b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -54,6 +54,66 @@ struct nested, 1, typename eval type; }; +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = + (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + } // end namespace internal template @@ -184,15 +244,17 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + BlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - #ifdef __SYCL_DEVICE_ONLY__ + using OutputTensorBlock = internal::TensorBlock; + +#ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) @@ -342,6 +404,9 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } + 
+ m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -484,6 +549,146 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + using ImagePatchCopyOp = internal::ImagePatchCopyOp; + using ImagePatchPaddingOp = internal::ImagePatchPaddingOp; + + // Calculate loop limits and various input/output dim sizes. + const DSizes& block_sizes = output_block->block_sizes(); + const bool col_major = + static_cast(Layout) == static_cast(ColMajor); + const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1]; + const Index output_depth_dim_size = + m_dimensions[col_major ? 0 : NumDims - 1]; + const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2]; + const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2]; + const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3]; + const Index block_col_stride = row_dim_size * depth_dim_size; + const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4]; + const Index outer_dim_size = + block_sizes.TotalSize() / + (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size); + + const Index patch_size = row_dim_size * col_dim_size * depth_dim_size; + const Index batch_size = patch_size * patch_index_dim_size; + + Index output_index = output_block->first_coeff_index(); + + // Loop through outer dimensions. + for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size; + ++outer_dim_index) { + const Index outer_output_base_index = outer_dim_index * batch_size; + // Find the offset of the element wrt the location of the first element. + const Index patchIndexStart = output_index / m_fastPatchStride; + const Index patchOffset = + (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth; + const Index colOffsetStart = patchOffset / m_fastColStride; + // Other ways to index this element. + const Index otherIndex = + (NumDims == 4) ? 0 : output_index / m_fastOtherStride; + const Index patch2DIndexStart = + (NumDims == 4) + ? 0 + : (output_index - otherIndex * m_otherStride) / m_fastPatchStride; + // Calculate starting depth index. + const Index depth = output_index - (output_index / m_fastOutputDepth) * + output_depth_dim_size; + const Index patch_input_base_index = + depth + otherIndex * m_patchInputStride; + + // Loop through patches. + for (Index patch_index_dim_index = 0; + patch_index_dim_index < patch_index_dim_size; + ++patch_index_dim_index) { + const Index patch_output_base_index = + outer_output_base_index + patch_index_dim_index * patch_size; + // Patch index corresponding to the passed in index. + const Index patchIndex = patchIndexStart + patch_index_dim_index; + const Index patch2DIndex = + (NumDims == 4) ? patchIndex + : patch2DIndexStart + patch_index_dim_index; + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index input_col_base = colIndex * m_col_strides; + const Index row_offset_base = + (patch2DIndex - colIndex * m_outputRows) * m_row_strides - + m_rowPaddingTop; + + // Loop through columns. 
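The copy helpers defined earlier in this file (ImagePatchCopyOp and ImagePatchPaddingOp) both follow the usual vectorized-main-loop-plus-scalar-tail pattern along the depth run. A standalone sketch of that pattern, with a fixed 4-wide chunk standing in for a SIMD packet and a single copy-or-pad function instead of two separate ops; names and the chunk width are illustrative.

#include <cstddef>

// Process a contiguous depth run a "packet" at a time, then finish the
// remainder one scalar at a time. In the real code the inner chunk is a
// pload/pstore of the packet type instead of this plain loop.
constexpr std::size_t kPacketSize = 4;

void CopyOrPadDepthRun(float* dst, const float* src, std::size_t n,
                       bool pad, float padding_value) {
  const std::size_t vectorized = (n / kPacketSize) * kPacketSize;
  for (std::size_t i = 0; i < vectorized; i += kPacketSize) {
    for (std::size_t j = 0; j < kPacketSize; ++j) {
      dst[i + j] = pad ? padding_value : src[i + j];
    }
  }
  for (std::size_t i = vectorized; i < n; ++i) {
    dst[i] = pad ? padding_value : src[i];
  }
}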
+ for (Index col_dim_index = 0; col_dim_index < col_dim_size; + ++col_dim_index) { + const Index col_output_base_index = + patch_output_base_index + col_dim_index * block_col_stride; + + // Calculate col index in the input original tensor. + Index colOffset = colOffsetStart + col_dim_index; + Index inputCol = + input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft; + Index origInputCol = + (m_col_inflate_strides == 1) + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + + bool pad_column = false; + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && + (inputCol != origInputCol * m_col_inflate_strides))) { + pad_column = true; + } + + const Index col_input_base_index = + patch_input_base_index + origInputCol * m_colInputStride; + const Index input_row_base = + row_offset_base + + ((patchOffset + col_dim_index * output_row_dim_size) - + colOffset * m_colStride) * + m_in_row_strides; + // Loop through rows. + for (Index row_dim_index = 0; row_dim_index < row_dim_size; + ++row_dim_index) { + const Index output_base_index = + col_output_base_index + row_dim_index * depth_dim_size; + bool pad_row = false; + Index inputIndex; + if (!pad_column) { + Index inputRow = + input_row_base + row_dim_index * m_in_row_strides; + Index origInputRow = + (m_row_inflate_strides == 1) + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) + : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && + (inputRow != origInputRow * m_row_inflate_strides))) { + pad_row = true; + } else { + inputIndex = + col_input_base_index + origInputRow * m_rowInputStride; + } + } + // Copy (or pad) along depth dimension. + if (pad_column || pad_row) { + ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue), + output_base_index, output_block->data()); + } else { + ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index, + output_block->data(), inputIndex); + } + } + } + } + output_index += m_otherStride; + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { @@ -538,6 +743,7 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; + Index m_block_total_size_max; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 498488649..3747bff9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -102,27 +102,64 @@ struct TensorEvaluator, Device> typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static const int NumOutputDims = internal::array_size::value; + static const int NumInputDims = internal::array_size::Dimensions>::value; + enum { - IsAligned = TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + // TODO(andydavis, wuke) Enable BlockAccess for the general case when the + // performance issue with block-based reshape is resolved. 
+ BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + using OutputTensorBlockReader = internal::TensorBlockReader; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size // of the input tensor. eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; + if (BlockAccess) { + const typename TensorEvaluator::Dimensions& input_dims = + m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -148,6 +185,140 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + if (m_impl.data() != NULL) { + OutputTensorBlockReader::Run(output_block, m_impl.data()); + return; + } + + // Calculate output block unit-stride inner dimension length. + const DSizes& output_block_sizes = + output_block->block_sizes(); + Index output_inner_dim_size = 1; + Index output_outer_dim_start = NumOutputDims; + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast(Layout) == static_cast(ColMajor) + ? i : NumOutputDims - i - 1; + output_inner_dim_size *= output_block_sizes[dim]; + if (output_block_sizes[dim] < m_dimensions[dim]) { + output_outer_dim_start = i + 1; + break; + } + } + + // Initialize output block iterator state. + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + array block_iter_state; + + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast(Layout) == static_cast(ColMajor) + ? 
i : NumOutputDims - i - 1; + block_iter_state[i].size = output_block_sizes[dim]; + block_iter_state[i].stride = m_outputStrides[dim]; + block_iter_state[i].span = + block_iter_state[i].stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + const Index output_outer_dim_size = output_block_sizes.TotalSize() / + output_inner_dim_size; + const typename TensorEvaluator::Dimensions& input_dims = + m_impl.dimensions(); + + Index index = output_block->first_coeff_index(); + for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) { + Index inner_idx = 0; + while (inner_idx < output_inner_dim_size) { + // Calculate input coords based on 'index'. + array input_coords; + Index idx = index; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[0] = idx; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[NumInputDims - 1] = idx; + } + + // Calculate target input block shape, using at most + // 'output_inner_dim_size' coefficients along the input block's inner + // dimensions. + DSizes input_block_sizes; + Index num_to_allocate = output_inner_dim_size - inner_idx; + for (Index i = 0; i < NumInputDims; ++i) { + const Index dim = + static_cast(Layout) == static_cast(ColMajor) + ? i : NumInputDims - i - 1; + input_block_sizes[dim] = numext::mini( + num_to_allocate, (static_cast(input_dims[dim]) - + input_coords[dim])); + if (input_coords[dim] == 0) { + num_to_allocate /= input_block_sizes[dim]; + } else { + num_to_allocate = 1; + } + } + + // Calculate input block strides. + DSizes input_block_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(index, input_block_sizes, + input_block_strides, m_inputStrides, + output_block->data() + outer_idx * + output_inner_dim_size + inner_idx); + + m_impl.block(&input_block); + + const Index input_block_total_size = input_block_sizes.TotalSize(); + index += input_block_total_size; + inner_idx += input_block_total_size; + } + eigen_assert(inner_idx == output_inner_dim_size); + index -= output_inner_dim_size; + // Update index. 
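The input_coords computation above is the standard flat-index-to-coordinates decomposition over column-major strides. A minimal version with std::vector in place of DSizes; both helper names are illustrative.

#include <cstddef>
#include <vector>

// Column-major strides: stride[0] = 1, stride[i] = stride[i-1] * dims[i-1].
std::vector<long> ColMajorStrides(const std::vector<long>& dims) {
  std::vector<long> strides(dims.size());
  long s = 1;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    strides[i] = s;
    s *= dims[i];
  }
  return strides;
}

// Peel off coordinates from the outermost dimension down, keeping the
// remainder as the coordinate of dimension 0.
std::vector<long> IndexToCoordsColMajor(long index,
                                        const std::vector<long>& strides) {
  const int rank = static_cast<int>(strides.size());
  std::vector<long> coords(rank, 0);
  for (int i = rank - 1; i > 0; --i) {
    coords[i] = index / strides[i];
    index -= coords[i] * strides[i];
  }
  if (rank > 0) coords[0] = index;
  return coords;
}

For dims {4, 3, 2} the strides are {1, 4, 12}, and index 17 decomposes to coords {1, 1, 1}.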
+ for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + index += block_iter_state[i].stride; + break; + } + block_iter_state[i].count = 0; + index -= block_iter_state[i].span; + } + } + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return const_cast(m_impl.data()); } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } @@ -155,6 +326,8 @@ struct TensorEvaluator, Device> protected: TensorEvaluator m_impl; NewDimensions m_dimensions; + DSizes m_outputStrides; + DSizes m_inputStrides; }; @@ -322,17 +495,27 @@ struct TensorEvaluator, Devi typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, + IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { @@ -340,6 +523,16 @@ struct TensorEvaluator, Devi eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); } + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= + op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || + op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast(Layout) == static_cast(ColMajor)) { @@ -367,13 +560,10 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -417,7 +607,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } template @@ -427,6 +621,10 @@ struct TensorEvaluator, Devi EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + if (m_is_identity) { + return m_impl.template packet(index); + } + Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize 
- 1}; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -469,9 +667,26 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + output_block->block_sizes(), + output_block->block_strides(), + Dimensions(m_inputStrides), + output_block->data()); + m_impl.block(&input_block); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { Scalar* result = m_impl.data(); @@ -544,7 +759,9 @@ struct TensorEvaluator, Devi TensorEvaluator m_impl; const Device& m_device; Dimensions m_dimensions; + bool m_is_identity; const StartIndices m_offsets; + Index m_block_total_size_max; }; @@ -557,33 +774,46 @@ struct TensorEvaluator, Device> typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; - enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; @@ -623,6 +853,14 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + this->m_impl.writeBlock(TensorBlock( + this->srcCoeff(block.first_coeff_index()), block.block_sizes(), + block.block_strides(), Dimensions(this->m_inputStrides), + const_cast(block.data()))); + } }; @@ -739,7 +977,13 @@ 
struct TensorEvaluator startIndicesClamped, stopIndicesClamped; + m_is_identity = true; for (size_t i = 0; i < internal::array_size::value; ++i) { + if (m_strides[i] != 1 || op.startIndices()[i] != 0 || + op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) { + m_is_identity = false; + } + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); @@ -822,11 +1066,15 @@ struct TensorEvaluator::PointerType data() const { @@ -873,6 +1121,7 @@ struct TensorEvaluator m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; + bool m_is_identity; TensorEvaluator m_impl; const Device& m_device; DSizes m_startIndices; // clamped startIndices @@ -916,7 +1165,11 @@ struct TensorEvaluatorm_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..05c0990dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -356,6 +356,70 @@ template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalize(accum_); } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. 
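BlockReducer above adapts a reducer functor (initialize/reduce/finalize) to contiguous runs of already-materialized block data. A stripped-down sketch of the same shape, with a hand-written SumOp standing in for Eigen's reducer functors and float hard-coded as the scalar type; all names here are illustrative.

#include <cstddef>

// Stand-in for a reducer functor: provides initialize/reduce/finalize.
struct SumOp {
  float initialize() const { return 0.f; }
  void reduce(float v, float* accum) const { *accum += v; }
  float finalize(float accum) const { return accum; }
};

// Accumulate over contiguous runs of block data, finalize at the end.
template <typename Op>
class SimpleBlockReducer {
 public:
  explicit SimpleBlockReducer(const Op& op)
      : op_(op), accum_(op.initialize()) {}
  void Reduce(std::size_t index, std::size_t n, const float* data) {
    for (std::size_t i = 0; i < n; ++i) op_.reduce(data[index + i], &accum_);
  }
  float Finalize() const { return op_.finalize(accum_); }

 private:
  Op op_;
  float accum_;
};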
+ return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + static const Index PacketSize = + internal::unpacket_traits::size; + + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const Index vectorized_size = + (num_values_to_reduce / PacketSize) * PacketSize; + for (Index i = 0; i < vectorized_size; i += PacketSize) { + op_.reducePacket( + internal::ploadt(&data[index + i]), + &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); } + PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -410,14 +475,19 @@ struct TensorEvaluator, static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using OutputTensorBlock = internal::TensorBlock; + using InputTensorBlock = internal::TensorBlock; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); @@ -451,11 +521,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -483,6 +555,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -492,6 +565,16 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? 
m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -686,6 +769,265 @@ struct TensorEvaluator, } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'PacketSize' + // reductions. + DSizes target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumOutputDims - 1; + const Index first_preserved_dim_input_index = + m_output_to_input_dim_map[first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = + first_preserved_dim_input_index == + (static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumInputDims - 1) | + PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = + output_block->block_sizes()[first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = + target_input_block_sizes[first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. + DSizes tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. 
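The shard bookkeeping that follows splits the output block's inner preserved dimension into output_shard_size pieces so that each shard's set of reducers stays within the L1 budget. A small sketch of just that split; the sizes are made up for illustration and divup mirrors the helper used in the patch.

#include <algorithm>
#include <cstdio>

// Ceiling division, as used to compute num_output_shards above.
long divup(long x, long y) { return (x + y - 1) / y; }

int main() {
  const long inner_dim_size = 10;  // output_block_inner_dim_size
  const long shard_size = 4;       // output_shard_size
  const long num_shards = divup(inner_dim_size, shard_size);
  for (long s = 0; s < num_shards; ++s) {
    const long begin = s * shard_size;
    const long extent = std::min(shard_size, inner_dim_size - begin);
    std::printf("shard %ld: outputs [%ld, %ld)\n", s, begin, begin + extent);
  }
}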
+ const Index first_preserved_dim_offset_start = + tensor_slice_offsets[first_preserved_dim_input_index]; + + array block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + 1 + : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNoConst* input_block_data = static_cast( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = + (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (output_shard_size / PacketSize + output_shard_size % PacketSize + + PacketSize) + : output_shard_size; + typedef internal::BlockReducer BlockReducer; + BlockReducer* reducers = static_cast( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. + for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + using TensorSliceBlockMapper = + internal::TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList()); + + const Index num_outputs_to_update = + tensor_slice_extents[first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) + ? num_outputs_to_update / PacketSize + : 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved + ? preserved_dim_vector_reducer_count * PacketSize + : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (preserved_dim_vector_reducer_count + + num_outputs_to_update % PacketSize) + : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = + block_mapper.GetBlockForIndex(b, input_block_data); + // Read. 
+ m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = + input_outer_dim_index * num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * PacketSize, + PacketSize, input_block.data()); + } + const Index scalar_reducer_base = + input_outer_dim_base + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. + const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_MAX + typename internal::remove_const::type + values[PacketSize]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * PacketSize; + internal::pstore( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < PacketSize; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = + output_base_index + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[scalar_reducer_base + i - + preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. + m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) @@ -722,6 +1064,12 @@ struct TensorEvaluator, template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -764,16 +1112,90 @@ struct TensorEvaluator, return startInput; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes& input_slice_sizes, + DSizes* target_input_block_sizes) const { + typedef typename internal::packet_traits::type Packet; + typedef internal::BlockReducer BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. + const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi( + static_cast(1), coeff_to_allocate / reducer_overhead); + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], alloc_size); + coeff_to_allocate = numext::maxi( + static_cast(1), + coeff_to_allocate / + ((*target_input_block_sizes)[dim] * reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + // Bitmap indicating if an input dimension is reduced or not. array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. 
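// GetInputCoordsForOutputIndex above is a standard stride-based
// delinearization: divide the flat index by each dimension's stride (from
// the outer-most dimension inward for column-major storage) and keep the
// remainder, then route each coordinate through the output-to-input
// dimension map. The sketch below shows only the delinearization core for a
// fixed 3-D column-major case; the dimensions and function names are
// illustrative.

#include <array>
#include <cassert>

inline std::array<long, 3> sketch_index_to_coords(
    long index, const std::array<long, 3>& dims) {
  // Column-major strides: stride[0] = 1, stride[i] = stride[i-1] * dims[i-1].
  const std::array<long, 3> strides = {1, dims[0], dims[0] * dims[1]};
  std::array<long, 3> coords = {0, 0, 0};
  for (int i = 2; i > 0; --i) {
    coords[i] = index / strides[i];
    index -= coords[i] * strides[i];
  }
  coords[0] = index;
  return coords;
}

inline void sketch_index_to_coords_check() {
  // For dims {2, 3, 4}: flat index 17 == 1 + 2 * 2 + 2 * 6 -> coords {1, 2, 2}.
  const std::array<long, 3> c = sketch_index_to_coords(17, {2, 3, 4});
  assert(c[0] == 1 && c[1] == 2 && c[2] == 2);
}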
@@ -782,6 +1204,9 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; + // Block size for tiled (aka TensorBlock) evaluation. + Index m_block_total_size_max; + // Evaluator for the input expression. TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 6b54f40ad..3add9dac0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase template struct TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Self; typedef TensorShufflingOp XprType; typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; @@ -110,44 +111,60 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + using TensorBlockReader = internal::TensorBlockReader; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; for (int i = 0; i < NumDims; ++i) { m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } } - array inputStrides; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; + m_unshuffledInputStrides[0] = 1; m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - inputStrides[NumDims - 1] = 1; + m_unshuffledInputStrides[NumDims - 1] = 1; m_outputStrides[NumDims - 1] = 1; for (int i = NumDims - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[shuffle[i]]; + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } + + m_block_total_size_max = + numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -162,29 +179,151 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } 
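// The shuffle evaluator's constructor records both the permutation (via
// m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]) and its inverse
// (m_inverseShuffle[shuffle[i]] = i), which the block path later needs to
// map dimensions in both directions. A tiny standalone sketch of that
// relationship; the 3-D permutation and the function name are arbitrary
// examples, not values used by Eigen.

#include <array>
#include <cassert>

inline void sketch_inverse_shuffle() {
  // Output dimension i reads input dimension shuffle[i].
  const std::array<int, 3> shuffle = {2, 0, 1};
  std::array<int, 3> inverse = {0, 0, 0};
  for (int i = 0; i < 3; ++i) inverse[shuffle[i]] = i;
  // `inverse` maps an input dimension back to the output dimension it feeds,
  // so composing the two permutations in either order yields the identity.
  for (int i = 0; i < 3; ++i) {
    assert(inverse[shuffle[i]] == i);
    assert(shuffle[inverse[i]] == i);
  }
}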
+ template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kUniformAllDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (m_impl.data() != NULL) { + // Fast path: we have direct access to the data, so shuffle as we read. + TensorBlockReader::Run(output_block, + srcCoeff(output_block->first_coeff_index()), + m_inverseShuffle, + m_unshuffledInputStrides, + m_impl.data()); + return; + } + + // Slow path: read unshuffled block from the input and shuffle in-place. + // Initialize input block sizes using input-to-output shuffle map. + DSizes input_block_sizes; + for (Index i = 0; i < NumDims; ++i) { + input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; + } + + // Calculate input block strides. + DSizes input_block_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + input_block_strides[i] = + input_block_strides[i - 1] * input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + input_block_strides[i] = + input_block_strides[i + 1] * input_block_sizes[i + 1]; + } + } + + // Read input block. + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + Dimensions(m_unshuffledInputStrides), + output_block->data()); + + m_impl.block(&input_block); + + // Naive In-place shuffle: random IO but block size is O(L1 cache size). + // TODO(andydavis) Improve the performance of this in-place shuffle. 
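// The "naive in-place shuffle" implemented below is the classic
// cycle-following permutation: walk each permutation cycle once, carrying
// the evicted value forward and marking visited slots in a bitmap so that
// every cycle is applied exactly once and every element is moved exactly
// once. A self-contained sketch on a plain array; `perm` plays the role of
// GetBlockOutputIndex and is an arbitrary illustrative permutation.

#include <cassert>
#include <cstddef>
#include <vector>

inline void sketch_cycle_shuffle() {
  const std::vector<std::size_t> perm = {2, 0, 3, 1};  // element i moves to perm[i]
  std::vector<char> data = {'a', 'b', 'c', 'd'};
  std::vector<bool> visited(data.size(), false);
  for (std::size_t start = 0; start < data.size(); ++start) {
    if (visited[start]) continue;
    std::size_t dst = perm[start];
    if (dst == start) { visited[start] = true; continue; }
    char carried = data[start];
    do {
      const char evicted = data[dst];
      data[dst] = carried;      // drop the carried value into its final slot
      carried = evicted;        // keep following the cycle with the evictee
      visited[dst] = true;
      dst = perm[dst];
    } while (dst != start);
    data[start] = carried;      // closing the cycle
    visited[start] = true;
  }
  // 'a' -> slot 2, 'b' -> slot 0, 'c' -> slot 3, 'd' -> slot 1.
  assert(data[0] == 'b' && data[1] == 'd' && data[2] == 'a' && data[3] == 'c');
}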
+ const Index total_size = input_block_sizes.TotalSize(); + std::vector bitmap(total_size, false); + ScalarNoConst* data = const_cast(output_block->data()); + const DSizes& output_block_strides = + output_block->block_strides(); + for (Index input_index = 0; input_index < total_size; ++input_index) { + if (bitmap[input_index]) { + // Coefficient at this index has already been shuffled. + continue; + } + + Index output_index = GetBlockOutputIndex(input_index, input_block_strides, + output_block_strides); + if (output_index == input_index) { + // Coefficient already in place. + bitmap[output_index] = true; + continue; + } + + // The following loop starts at 'input_index', and shuffles + // coefficients into their shuffled location at 'output_index'. + // It skips through the array shuffling coefficients by following + // the shuffle cycle starting and ending a 'start_index'. + ScalarNoConst evicted_value; + ScalarNoConst shuffled_value = data[input_index]; + do { + evicted_value = data[output_index]; + data[output_index] = shuffled_value; + shuffled_value = evicted_value; + bitmap[output_index] = true; + output_index = GetBlockOutputIndex(output_index, input_block_strides, + output_block_strides); + } while (output_index != input_index); + + data[output_index] = shuffled_value; + bitmap[output_index] = true; } - PacketReturnType rslt = internal::pload(values); - return rslt; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : + NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); } EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } @@ -195,27 +334,57 @@ struct TensorEvaluator, Device> EIGEN_STRONG_INLINE const TensorEvaluator& impl() const {return m_impl;} protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes& input_block_strides, + const DSizes& output_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[0]; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; + 
const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[NumDims - 1]; } } + Dimensions m_dimensions; + bool m_is_identity; + array m_inverseShuffle; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; + array m_unshuffledInputStrides; + Index m_block_total_size_max; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; @@ -239,12 +408,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + using TensorBlockWriter = internal::TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -265,6 +440,14 @@ struct TensorEvaluator, Device> this->coeffRef(index+i) = values[i]; } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), + this->m_inverseShuffle, + this->m_unshuffledInputStrides, this->m_impl.data()); + } }; diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 274f901ce..448f47f1d 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -18,22 +18,30 @@ using Eigen::RowMajor; using Eigen::ColMajor; // A set of tests to verify that different TensorExecutor strategies yields the -// same results for all the ops, supporting tiled execution. +// same results for all the ops, supporting tiled evaluation. + +template +static array RandomDims(int min_dim = 1, int max_dim = 20) { + array dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random(min_dim, max_dim); + } + return dims; +}; + +template +static void test_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; -template -static void test_execute_binary_expr(Device d) { // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. 
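// Every test in this file follows the same shape: evaluate the expression
// once with the default, already-trusted evaluation path to produce a
// "golden" tensor, then evaluate it again under the executor configuration
// being tested and compare coefficient by coefficient. The sketch below
// shows that pattern using only the public device() API; the real tests
// instead instantiate internal::TensorExecutor directly so that the
// vectorized/tiled code paths can be pinned explicitly.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

template <typename Device>
void sketch_golden_comparison(Device d) {
  Eigen::Tensor<float, 3> lhs(8, 9, 10), rhs(8, 9, 10);
  lhs.setRandom();
  rhs.setRandom();

  Eigen::Tensor<float, 3> golden = lhs + rhs;  // trusted default evaluation

  Eigen::Tensor<float, 3> dst(8, 9, 10);
  dst.device(d) = lhs + rhs;                   // evaluation path under test

  for (Eigen::Index i = 0; i < dst.size(); ++i) {
    assert(dst.coeff(i) == golden.coeff(i));
  }
}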
- int d0 = internal::random(100, 200); - int d1 = internal::random(100, 200); - int d2 = internal::random(100, 200); + auto dims = RandomDims(50 / NumDims, 100 / NumDims); - static constexpr int Options = 0; - using IndexType = int; - - Tensor lhs(d0, d1, d2); - Tensor rhs(d0, d1, d2); - Tensor dst(d0, d1, d2); + Tensor lhs(dims); + Tensor rhs(dims); + Tensor dst(dims); lhs.setRandom(); rhs.setRandom(); @@ -46,33 +54,389 @@ static void test_execute_binary_expr(Device d) { Executor::run(Assign(dst, expr), d); - for (int i = 0; i < d0; ++i) { - for (int j = 0; j < d1; ++j) { - for (int k = 0; k < d2; ++k) { - float sum = lhs(i, j, k) + rhs(i, j, k); - VERIFY_IS_EQUAL(sum, dst(i, j, k)); - } - } + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); } } -#define CALL_SUBTEST_COMBINATIONS(NAME) \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))) +template +static void test_execute_broadcasting(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims(1, 7); + const auto expr = src.broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. + Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +}; + +template +static void test_execute_chipping_rvalue(Device d) { + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + const auto offset = internal::random(0, dims[(CHIP_DIM)] - 1); \ + const auto expr = src.template chip<(CHIP_DIM)>(offset); \ + \ + Tensor golden; \ + golden = expr; \ + \ + Tensor dst(golden.dimensions()); \ + \ + using Assign = TensorAssignOp; \ + using Executor = internal::TensorExecutor; \ + \ + Executor::run(Assign(dst, expr), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +}; + +template +static void test_execute_chipping_lvalue(Device d) { + auto dims = RandomDims(1, 10); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + /* Generate random data that we'll assign to the chipped tensor dim. */ \ + array src_dims; \ + for (int i = 0; i < NumDims - 1; ++i) { \ + int dim = i < (CHIP_DIM) ? 
i : i + 1; \ + src_dims[i] = dims[dim]; \ + } \ + \ + Tensor src(src_dims); \ + src.setRandom(); \ + \ + const auto offset = internal::random(0, dims[(CHIP_DIM)] - 1); \ + \ + /* Generate random data to fill non-chipped dimensions*/ \ + Tensor random(dims); \ + random.setRandom(); \ + \ + Tensor golden(dims); \ + golden = random; \ + golden.template chip<(CHIP_DIM)>(offset) = src; \ + \ + Tensor dst(dims); \ + dst = random; \ + auto expr = dst.template chip<(CHIP_DIM)>(offset); \ + \ + using Assign = TensorAssignOp; \ + using Executor = internal::TensorExecutor; \ + \ + Executor::run(Assign(expr, src), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +}; + +template +static void test_execute_shuffle_rvalue(Device d) { + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + + // Create a random dimension re-ordering/shuffle. + std::vector shuffle; + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + const auto expr = src.shuffle(shuffle); + + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now do the shuffling using configured tensor executor. + Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_shuffle_lvalue(Device d) { + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Create a random dimension re-ordering/shuffle. + std::vector shuffle; + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + array shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; + + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden(shuffled_dims); + golden.shuffle(shuffle) = src; + + // Now do the shuffling using configured tensor executor. + Tensor dst(shuffled_dims); + + auto expr = dst.shuffle(shuffle); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(expr, src), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_reduction(Device d) +{ + static_assert(NumDims >= 2); + + static constexpr int ReducedDims = NumDims - 2; + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Pick two random and unique reduction dimensions. 
+ int reduction0 = internal::random(0, NumDims - 1); + int reduction1 = internal::random(0, NumDims - 1); + while (reduction0 == reduction1) { + reduction1 = internal::random(0, NumDims - 1); + } + + DSizes reduction_axis; + reduction_axis[0] = reduction0; + reduction_axis[1] = reduction1; + + Tensor golden = src.sum(reduction_axis); + + // Now do the reduction using configured tensor executor. + Tensor dst(golden.dimensions()); + + auto expr = src.sum(reduction_axis); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_reshape(Device d) +{ + static_assert(NumDims >= 2); + + static constexpr int ReshapedDims = NumDims - 1; + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Multiple 0th dimension and then shuffle. + std::vector shuffle; + for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + DSizes reshaped_dims; + reshaped_dims[shuffle[0]] = dims[0] * dims[1]; + for (int i = 2; i < NumDims; ++i) reshaped_dims[shuffle[i]] = dims[i]; + + Tensor golden = src.reshape(reshaped_dims); + + // Now reshape using configured tensor executor. + Tensor dst(golden.dimensions()); + + auto expr = src.reshape(reshaped_dims); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_slice_rvalue(Device d) +{ + static_assert(NumDims >= 2); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes(RandomDims()); + auto slice_size = DSizes(RandomDims()); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor golden = + src.slice(slice_start, slice_size); + + // Now reshape using configured tensor executor. + Tensor dst(golden.dimensions()); + + auto expr = src.slice(slice_start, slice_size); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_slice_lvalue(Device d) +{ + static_assert(NumDims >= 2); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes(RandomDims(1, 10)); + auto slice_size = DSizes(RandomDims(1, 10)); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor slice(slice_size); + slice.setRandom(); + + // Asign a slice using default executor. 
+ Tensor golden = src; + golden.slice(slice_start, slice_size) = slice; + + // And using configured execution strategy. + Tensor dst = src; + auto expr = dst.slice(slice_start, slice_size); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(expr, slice), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +#define CALL_SUBTEST_COMBINATIONS(NAME, T, NUM_DIMS) \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))) EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; @@ -81,7 +445,49 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); - CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 3); + 
CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 5); } #undef CALL_SUBTEST_COMBINATIONS From 64abdf1d7eb17174f571751346dd0cbadcf3bc52 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 12:35:19 -0700 Subject: [PATCH 03/10] Fix typo + get rid of redundant member variables for block sizes --- .../CXX11/src/Tensor/TensorBroadcasting.h | 6 +++--- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 17 ++++++++--------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 11 +++++------ .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++--------- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +++------- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 18 ++++++++++-------- 6 files changed, 32 insertions(+), 42 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6dbe5a22..cca14aafd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> // Block based access to the XprType (input) tensor. using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. using BroadcastDimensions = DSizes; using BroadcastTensorBlock = internal::TensorBlock; @@ -589,8 +589,8 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); - Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7579ab507..aca2ead12 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -202,9 +202,6 @@ struct TensorEvaluator, Device> m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } } @@ -290,9 +287,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -370,13 +369,14 @@ struct TensorEvaluator, Device> { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer + // division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -392,7 +392,6 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; - Index m_block_total_size_max; DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 39759b6c3..a8247be90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device) + : m_device(device), m_impl(op.expression(), device) #ifdef EIGEN_USE_SYCL , m_op(op) #endif @@ -404,9 +404,6 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -551,9 +548,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -743,8 +742,8 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; - Index m_block_total_size_max; + const Device& m_device; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL // Required for SYCL in order to construct the expression tree on the device diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2630311b8..6ddded0bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -560,9 +560,6 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -672,9 +669,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - 
m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -761,7 +760,6 @@ struct TensorEvaluator, Devi Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; - Index m_block_total_size_max; }; @@ -1047,9 +1045,6 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -1128,7 +1123,6 @@ struct TensorEvaluator m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; //use by sycl const StartIndices m_exprStartIndices; //use by sycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c41783106..73675e7dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -572,9 +572,6 @@ struct TensorEvaluator, : (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -771,9 +768,11 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -1204,9 +1203,6 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; - // Block size for tiled (aka TensorBlock) evaluation. - Index m_block_total_size_max; - // Evaluator for the input expression. 
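// The getResourceRequirements() overrides rewritten in this patch all follow
// the same recipe: turn a cache size reported by the device into a
// coefficient budget, advertise it together with a preferred block shape,
// and forward the call to the wrapped evaluator. A stripped-down sketch of
// just the budget arithmetic; the 16 KiB figure and the helper name are
// illustrative, not values Eigen guarantees.

#include <algorithm>
#include <cstddef>

template <typename Scalar>
std::ptrdiff_t sketch_block_total_size_max(std::ptrdiff_t cache_size_bytes) {
  // Clamp to at least one coefficient, even if sizeof(Scalar) exceeds the
  // reported cache size.
  return std::max<std::ptrdiff_t>(
      1, cache_size_bytes / static_cast<std::ptrdiff_t>(sizeof(Scalar)));
}
// e.g. sketch_block_total_size_max<float>(16 * 1024) == 4096 coefficients.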
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 77f47bf64..f94c1380d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -124,8 +124,11 @@ struct TensorEvaluator, Device> using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_impl(op.expression(), device), + m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -162,9 +165,6 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } - - m_block_total_size_max = - numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -226,9 +226,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, - m_block_total_size_max)); + internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -384,7 +385,8 @@ struct TensorEvaluator, Device> array, NumDims> m_fastOutputStrides; array m_inputStrides; array m_unshuffledInputStrides; - Index m_block_total_size_max; + + const Device& m_device; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; From 1b0373ae10687ecc51ad9a0bfd46aa4ee116ade1 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 15:55:46 -0700 Subject: [PATCH 04/10] Replace all using declarations with typedefs in Tensor ops --- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 20 +++++----- .../CXX11/src/Tensor/TensorBroadcasting.h | 17 +++++--- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 16 +++++--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 40 ++++++++++--------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 7 ++-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 ++++++---- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 ++++--- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 16 +++++--- 8 files changed, 88 insertions(+), 64 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 33c4ef5b7..1db8d6124 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -152,11 +152,11 @@ struct TensorBlockCopyOp { const Scalar* src_base = &src_data[src_index]; Scalar* dst_base = &dst_data[dst_index]; - using Src = const Eigen::Array; - using Dst = Eigen::Array; + typedef const Eigen::Array Src; + typedef Eigen::Array Dst; - using SrcMap = Eigen::Map>; - using DstMap = Eigen::Map>; + typedef Eigen::Map> SrcMap; + typedef Eigen::Map> DstMap; const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); DstMap dst(dst_base, num_coeff_to_copy, 
InnerStride<>(dst_stride)); @@ -401,13 +401,13 @@ struct TensorBlockCwiseBinaryOp { const StorageIndex left_stride, const LeftScalar* left_data, const StorageIndex right_index, const StorageIndex right_stride, const RightScalar* right_data) { - using Lhs = const Eigen::Array; - using Rhs = const Eigen::Array; - using Out = Eigen::Array; + typedef const Eigen::Array Lhs; + typedef const Eigen::Array Rhs; + typedef Eigen::Array Out; - using LhsMap = Eigen::Map>; - using RhsMap = Eigen::Map>; - using OutMap = Eigen::Map>; + typedef Eigen::Map> LhsMap; + typedef Eigen::Map> RhsMap; + typedef Eigen::Map> OutMap; const LeftScalar* lhs_base = &left_data[left_index]; const RightScalar* rhs_base = &right_data[right_index]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index cca14aafd..a4d750885 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -115,16 +115,21 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; // Block based access to the XprType (input) tensor. - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. - using BroadcastDimensions = DSizes; - using BroadcastTensorBlock = internal::TensorBlock; - using BroadcastTensorBlockReader = internal::TensorBlockReader; + typedef DSizes BroadcastDimensions; + typedef internal::TensorBlock + BroadcastTensorBlock; + typedef internal::TensorBlockReader + BroadcastTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index aca2ead12..b4c4162ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,10 +152,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) @@ -426,10 +428,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 17008917a..f11241f83 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,7 +36,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -60,7 +60,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -102,21 +102,22 @@ template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); @@ -221,7 +222,7 @@ struct EvalRange { template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { @@ -249,20 +250,21 @@ class TensorExecutor { template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = - TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); StorageIndex total_size = array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index a8247be90..2b2f4a650 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -252,7 +252,8 @@ struct TensorEvaluator, Device> RawAccess = false }; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; #ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) @@ -557,8 +558,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( OutputTensorBlock* output_block) const { - using ImagePatchCopyOp = internal::ImagePatchCopyOp; - using ImagePatchPaddingOp = internal::ImagePatchPaddingOp; + typedef internal::ImagePatchCopyOp ImagePatchCopyOp; + typedef internal::ImagePatchPaddingOp ImagePatchPaddingOp; // Calculate 
loop limits and various input/output dim sizes. const DSizes& block_sizes = output_block->block_sizes(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddded0bd..d5b0c1237 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -123,11 +123,15 @@ struct TensorEvaluator, Device> RawAccess = TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; - using OutputTensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlockReader + OutputTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) @@ -512,9 +516,10 @@ struct TensorEvaluator, Devi RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -787,9 +792,10 @@ struct TensorEvaluator, Device> RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 73675e7dd..80f179ba4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -483,10 +483,12 @@ struct TensorEvaluator, RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using OutputTensorBlock = internal::TensorBlock; - using InputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlock + InputTensorBlock; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; @@ -901,9 +903,9 @@ struct TensorEvaluator, new (&reducers[i]) BlockReducer(m_reducer); } - using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper; + typedef internal::TensorSliceBlockMapper + TensorSliceBlockMapper; // TODO(andydavis) Consider removing 'input_block_stride_order' if we // find that scattered reads are not worth supporting in diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f94c1380d..fbe69aabc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -119,10 +119,12 @@ struct TensorEvaluator, Device> 
RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -417,10 +419,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockWriter = internal::TensorBlockWriter; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockWriter + TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) From cfaedb38cd662def3b5684a20965b3bc1b0d6a3f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 9 Aug 2018 09:44:07 -0700 Subject: [PATCH 05/10] Fix bug in a test + compilation errors --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 6 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 8 ++++---- unsupported/test/cxx11_tensor_executor.cpp | 2 +- unsupported/test/cxx11_tensor_shuffling.cpp | 8 ++++---- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f11241f83..4bc18bd97 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -115,6 +115,7 @@ class TensorExecutor TensorBlock; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; typedef TensorBlockMapper TensorBlockMapper; @@ -141,8 +142,9 @@ class TensorExecutor( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d5b0c1237..b25c1eabc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -520,6 +520,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlock TensorBlock; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -687,7 +688,7 @@ struct TensorEvaluator, Devi TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), output_block->block_sizes(), output_block->block_strides(), - Dimensions(m_inputStrides), + TensorBlockDimensions(m_inputStrides), output_block->data()); m_impl.block(&input_block); } @@ -796,6 +797,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlock TensorBlock; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) @@ -862,13 +864,11 @@ struct TensorEvaluator, Device> const TensorBlock& block) { this->m_impl.writeBlock(TensorBlock( this->srcCoeff(block.first_coeff_index()), block.block_sizes(), - block.block_strides(), Dimensions(this->m_inputStrides), + block.block_strides(), TensorBlockDimensions(this->m_inputStrides), const_cast(block.data()))); } }; - - namespace internal { template struct traits > : public traits diff 
--git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 448f47f1d..d16ae4d11 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -317,7 +317,7 @@ static void test_execute_reshape(Device d) DSizes reshaped_dims; reshaped_dims[shuffle[0]] = dims[0] * dims[1]; - for (int i = 2; i < NumDims; ++i) reshaped_dims[shuffle[i]] = dims[i]; + for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1]; Tensor golden = src.reshape(reshaped_dims); diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index ab19b6e6b..467df39c7 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -83,10 +83,10 @@ static void test_expr_shuffling() Tensor result(5,7,3,2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim{{2,3,1,7}}; + array src_slice_start{{0,0,0,0}}; + array dst_slice_dim{{1,7,3,2}}; + array dst_slice_start{{0,0,0,0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = From f2209d06e428e0691de71f30fc2db4cb29191cd2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 16:53:36 -0700 Subject: [PATCH 06/10] Add block evaluation to CwiseUnaryOp and add PreferBlockAccess enum to all evaluators --- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 + .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorAssign.h | 18 ++- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 141 ++++++++++++++++++ .../CXX11/src/Tensor/TensorBroadcasting.h | 11 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 13 +- .../CXX11/src/Tensor/TensorConcatenation.h | 2 + .../CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../CXX11/src/Tensor/TensorConvolution.h | 2 + .../CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 + .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 +++++++--- .../Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../src/Tensor/TensorForwardDeclarations.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 46 +++--- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + .../Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorRef.h | 3 + .../Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + .../Eigen/CXX11/src/Tensor/TensorScan.h | 1 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 24 +-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + .../Eigen/CXX11/src/Tensor/TensorTrace.h | 1 + .../CXX11/src/Tensor/TensorVolumePatch.h | 1 + .../test/cxx11_tensor_block_access.cpp | 111 ++++++++++++++ unsupported/test/cxx11_tensor_executor.cpp | 31 ++++ 35 files changed, 454 insertions(+), 82 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index c0f33ba2d..ea3ab329d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++
b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -87,6 +87,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -220,6 +221,7 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 442639868..5110e99ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Sy IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index f1f877c16..9ec1ec726 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,14 +102,16 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::TensorBlock< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 877603421..4a3e1ac17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -381,6 +381,147 @@ class TensorBlockWriter : public TensorBlockIO + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex input_index, + const StorageIndex input_stride, const InputScalar* input_data) { + typedef const Eigen::Array Input; + typedef Eigen::Array Output; + + typedef Eigen::Map> InputMap; + typedef Eigen::Map> OutputMap; + + const InputScalar* input_base = &input_data[input_index]; + OutputScalar* output_base = &output_data[output_index]; + + const InputMap input(input_base, num_coeff, InnerStride<>(input_stride)); + OutputMap output(output_base, num_coeff, InnerStride<>(output_stride)); + + output = Eigen::CwiseUnaryOp(input, functor); + } +}; + +/** + * \class TensorBlockCwiseUnaryIO + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block IO class for carrying out cwise unary ops. 
+ * + * This class carries out the unary op on given blocks. + */ +template +struct TensorBlockCwiseUnaryIO { + typedef typename internal::TensorBlock::Dimensions Dimensions; + + struct BlockIteratorState { + StorageIndex output_stride, output_span; + StorageIndex input_stride, input_span; + StorageIndex size, count; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const Dimensions& block_sizes, + const Dimensions& block_strides, OutputScalar* output_data, + const array& input_strides, + const InputScalar* input_data) { + // Find the innermost dimension whose size is not 1. This is the effective + // inner dim. If all dimensions are of size 1, fallback to using the actual + // innermost dim to avoid out-of-bound access. + int num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + if (block_sizes[dim] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const int inner_dim = + NumDims == 0 ? 1 + : cond()(num_size_one_inner_dims, + NumDims - num_size_one_inner_dims - 1); + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + // Merge multiple inner dims into one for larger inner dim size (i.e. + // fewer calls to TensorBlockCwiseUnaryOp::Run()). + if (inner_dim_size == block_strides[dim] && + block_strides[dim] == input_strides[dim]) { + inner_dim_size *= block_sizes[dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex output_index = 0, input_index = 0; + + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex input_stride = + NumDims == 0 ? 1 : input_strides[inner_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond()(i + 1, NumDims - i - 2); + const StorageIndex size = block_sizes[dim]; + if (size == 1) { + continue; + } + BlockIteratorState& state = block_iter_state[num_squeezed_dims]; + state.output_stride = block_strides[dim]; + state.input_stride = input_strides[dim]; + state.size = size; + state.output_span = state.output_stride * (size - 1); + state.input_span = state.input_stride * (size - 1); + state.count = 0; + ++num_squeezed_dims; + } + + // Compute cwise unary op. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { + TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index, + output_stride, output_data, input_index, + input_stride, input_data); + // Update index. 
+ for (int j = 0; j < num_squeezed_dims; ++j) { + auto& state = block_iter_state[j]; + if (++state.count < state.size) { + output_index += state.output_stride; + input_index += state.input_stride; + break; + } + state.count = 0; + output_index -= state.output_span; + input_index -= state.input_span; + } + } + } +}; + /** * \class TensorBlockCwiseBinaryOp * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a4d750885..5e812b04d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,11 +108,12 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b4c4162ef..76fab39e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,12 +144,13 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 27c92d8f6..3863ee8c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -123,6 +123,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -308,6 +309,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e604456e8..c459fc649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -241,6 +241,7 @@ struct TensorContractionEvaluatorBase IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a7751eee1..1f613d3c7 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -196,6 +196,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a07e32db0..2d0e6599f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -308,6 +308,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -780,6 +781,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index d301d0c01..e79958fc9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -243,6 +243,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 47b5a5a5e..d71b2e34b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 256d499f2..554ee5f59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,6 +108,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 028902fea..e30f9ad8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -195,6 +196,7 @@ struct TensorEvaluator IsAligned = 
Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -288,6 +290,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -351,27 +354,34 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -399,6 +409,29 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_argImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView arg_block(m_device, m_argImpl, + *output_block); + internal::TensorBlockCwiseUnaryIO::Run(m_functor, + output_block->block_sizes(), + output_block + ->block_strides(), + output_block->data(), + arg_block.block_strides(), + arg_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor @@ -408,6 +441,7 @@ struct TensorEvaluator, Device> private: + const Device& m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -421,16 +455,18 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = 
TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -501,7 +537,7 @@ struct TensorEvaluatordata()[0] = coeff(0); + output_block->data()[0] = coeff(Index(0)); return; } internal::TensorBlockView left_block( @@ -543,6 +579,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -648,6 +685,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6ab4d997..480cf1f39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -136,6 +136,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1342e47a6..71ba56773 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,6 +42,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 2778bf5ec..edf9f85d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -99,6 +99,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 0dd524a30..b8a57ab99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -132,7 +132,12 @@ struct IsVectorizable { template struct IsTileable { - static const bool value = TensorEvaluator::BlockAccess; + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). 
+ static const bool value = + TensorEvaluator::BlockAccess & + TensorEvaluator::PreferBlockAccess; }; template , Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2b2f4a650..1826d7022 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -244,12 +244,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef internal::TensorBlock diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 64f2ad81f..e28565009 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,6 +91,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index e3165fa10..998757d14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -120,6 +120,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -183,6 +184,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index b25c1eabc..6f9294ccf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -111,16 +111,17 @@ struct TensorEvaluator, Device> static const int NumInputDims = internal::array_size::Dimensions>::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, // TODO(andydavis, wuke) Enable BlockAccess for the general case when the // performance issue with block-based reshape is resolved. 
- BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -349,6 +350,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -508,12 +510,13 @@ struct TensorEvaluator, Devi enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -785,12 +788,13 @@ struct TensorEvaluator, Device> typedef Sizes Dimensions; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -972,6 +976,7 @@ struct TensorEvaluator::Layout, RawAccess = false }; @@ -1148,6 +1153,7 @@ struct TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index aa1db3c73..59c1704ed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a0a1ad8f4..4292fe0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 80f179ba4..3d534eaa2 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -475,12 +475,13 @@ struct TensorEvaluator, static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index a6cade50f..6e15e75f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -137,6 +137,7 @@ template class TensorRef : public TensorBase, Device> IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -414,6 +416,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index b1135f297..641366d9d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> { IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 98f125408..a5b541a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -111,12 +111,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -412,11 +413,12 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = 
TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 4b69072f2..221dc96c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -275,6 +276,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index ea53bb04b..9fc54a4c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 3c7d8bbc0..c1b7a58ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -200,6 +200,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 6feeff231..746f62511 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -517,6 +517,114 @@ static void test_block_io_squeeze_ones() { } } +template +static void test_block_cwise_unary_io_basic() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes = RandomDims(); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + T* input_data = GenerateRandomData(total_size); + + T* output_data = new T[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_squeeze_ones() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes(1, 2, 1, 3, 1); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. 
+ auto* input_data = GenerateRandomData(total_size); + + auto* output_data = new float[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_zero_strides() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes rnd_dims = RandomDims<5>(); + + DSizes input_sizes = rnd_dims; + input_sizes[0] = 1; + input_sizes[2] = 1; + input_sizes[4] = 1; + + DSizes input_strides(ComputeStrides(input_sizes)); + input_strides[0] = 0; + input_strides[2] = 0; + input_strides[4] = 0; + + // Generate random data. + auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + + DSizes output_sizes = rnd_dims; + DSizes output_strides(ComputeStrides(output_sizes)); + + const auto output_total_size = output_sizes.TotalSize(); + auto* output_data = new float[output_total_size]; + + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, + output_data, input_strides, input_data); + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { + Index output_index = i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4]; + Index input_index = i * input_strides[0] + j * input_strides[1] + + k * input_strides[2] + l * input_strides[3] + + m * input_strides[4]; + VERIFY_IS_EQUAL(output_data[output_index], + functor(input_data[input_index])); + } + } + } + } + } + + delete[] input_data; + delete[] output_data; +} + template static void test_block_cwise_binary_io_basic() { typedef internal::scalar_sum_op BinaryFunctor; @@ -982,6 +1090,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); TEST_LAYOUTS(test_block_io_zero_stride); TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic); + TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides); TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index d16ae4d11..1dc18220c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -29,6 +29,33 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { return dims; }; +template +static void test_execute_unary_expr(Device d) { + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + template static void test_execute_binary_expr(Device d) @@ -445,6 +472,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 5); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); From 35d90e89600ff2524ec8bdd4ef4b95dd7c78b656 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 17:37:58 -0700 Subject: [PATCH 07/10] Fix BlockAccess enum in CwiseUnaryOp evaluator --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index e30f9ad8e..4ca6b3d8c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -357,7 +357,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b8a57ab99..b7a0193fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -135,9 +135,8 @@ struct IsTileable { // Check that block evaluation is supported and it's a preferred option (at // least one sub-expression has much faster block evaluation, e.g. // broadcasting). 
- static const bool value = - TensorEvaluator::BlockAccess & - TensorEvaluator::PreferBlockAccess; + static const bool value = TensorEvaluator::BlockAccess && + TensorEvaluator::PreferBlockAccess; }; template Date: Tue, 11 Sep 2018 13:32:32 -0700 Subject: [PATCH 08/10] Fix compilation of tiled evaluation code with c++03 --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 6 +++--- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +++++------ .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 ++-- unsupported/test/cxx11_tensor_block_access.cpp | 14 +++++++------- unsupported/test/cxx11_tensor_shuffling.cpp | 10 +++++----- 9 files changed, 30 insertions(+), 35 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 6d90af2d3..13da36257 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -396,8 +396,8 @@ struct TensorBlockCwiseUnaryOp { typedef const Eigen::Array Input; typedef Eigen::Array Output; - typedef Eigen::Map> InputMap; - typedef Eigen::Map> OutputMap; + typedef Eigen::Map > InputMap; + typedef Eigen::Map > OutputMap; const InputScalar* input_base = &input_data[input_index]; OutputScalar* output_base = &output_data[output_index]; @@ -502,7 +502,7 @@ struct TensorBlockCwiseUnaryIO { input_stride, input_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { - auto& state = block_iter_state[j]; + BlockIteratorState& state = block_iter_state[j]; if (++state.count < state.size) { output_index += state.output_stride; input_index += state.input_stride; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 02d061a9c..e5cf93ab0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -596,12 +596,11 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -617,8 +616,8 @@ struct TensorEvaluator, Device> // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size // for the first outer dim, and 1 for other outer dims. This is guaranteed // by MergeResourceRequirements() in TensorBlock.h. - const auto& output_block_sizes = output_block->block_sizes(); - const auto& output_block_strides = output_block->block_strides(); + const Dimensions& output_block_sizes = output_block->block_sizes(); + const Dimensions& output_block_strides = output_block->block_strides(); // Find where outer dims start. 
int outer_dim_start = 0; @@ -642,7 +641,7 @@ struct TensorEvaluator, Device> return; } - const auto& input_dims = m_impl.dimensions(); + const Dimensions& input_dims = m_impl.dimensions(); // Pre-fill input_block_sizes, broadcast_block_sizes, // broadcast_block_strides, and broadcast_tensor_strides. Later on we will diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 76fab39e2..b47fa9e8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -290,11 +290,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 1826d7022..965bd8f1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -550,11 +550,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2f765acb7..16dc74afe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -677,11 +677,10 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 3d534eaa2..eeb2578fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -771,11 +771,10 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); 
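
Aside from the enum rename, the getResourceRequirements() hunks in this series all follow one pattern: derive a per-block coefficient budget from a device cache size, push a TensorOpResourceRequirements entry naming the preferred block shape, and forward the call to the nested evaluator so the executor can merge the requirements of the whole expression tree. Below is a minimal sketch of that pattern as a free function, assuming the internal names used in these hunks; the Evaluator and Device template parameters are placeholders, not concrete Eigen types.

#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

// Sketch only: mirrors the shape of the getResourceRequirements() bodies
// touched by this patch series.
template <typename Scalar, typename Evaluator, typename Device>
void get_block_requirements_sketch(
    const Device& device, const Evaluator& nested,
    std::vector<Eigen::internal::TensorOpResourceRequirements>* resources) {
  // Cap the number of coefficients per block by what fits in the last-level
  // cache (explicit Eigen::Index instead of 'auto', as the C++03 fix requires).
  const Eigen::Index block_total_size_max = Eigen::numext::maxi<Eigen::Index>(
      1, device.lastLevelCacheSize() / sizeof(Scalar));
  // Prefer blocks with a large inner dimension for contiguous memory access.
  resources->push_back(Eigen::internal::TensorOpResourceRequirements(
      Eigen::internal::kSkewedInnerDims, block_total_size_max));
  // Forward to the nested expression so the executor can merge requirements.
  nested.getResourceRequirements(resources);
}

In the hunks above, chipping, image patch, slicing and reduction target the last-level cache with kSkewedInnerDims blocks, while broadcasting and shuffling target the first-level cache, and shuffling alone asks for kUniformAllDims blocks.
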
m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index a5b541a68..e018d0ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -229,10 +229,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); + internal::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index eec282ba7..ad12ae557 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -535,7 +535,7 @@ static void test_block_cwise_unary_io_basic() { DSizes block_sizes = RandomDims(); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. T* input_data = GenerateRandomData(total_size); @@ -562,12 +562,12 @@ static void test_block_cwise_unary_io_squeeze_ones() { DSizes block_sizes(1, 2, 1, 3, 1); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. - auto* input_data = GenerateRandomData(total_size); + float* input_data = GenerateRandomData(total_size); - auto* output_data = new float[total_size]; + float* output_data = new float[total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, strides, input_data); @@ -599,13 +599,13 @@ static void test_block_cwise_unary_io_zero_strides() { input_strides[4] = 0; // Generate random data. 
- auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + float* input_data = GenerateRandomData(input_sizes.TotalSize()); DSizes output_sizes = rnd_dims; DSizes output_strides(ComputeStrides(output_sizes)); - const auto output_total_size = output_sizes.TotalSize(); - auto* output_data = new float[output_total_size]; + const Index output_total_size = output_sizes.TotalSize(); + float* output_data = new float[output_total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 467df39c7..062dd1c0f 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5, 7, 3, 2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim({2, 3, 1, 7}); + array src_slice_start({0, 0, 0, 0}); + array dst_slice_dim({1, 7, 3, 2}); + array dst_slice_start({0, 0, 0, 0}); for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = From d138fe341dab4b7c1a7b7d012a3e13e1796aad8f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 11 Sep 2018 17:23:18 -0700 Subject: [PATCH 09/10] Fix static_assert in test to conform to the C++11 standard --- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 1dc18220c..8639e7b38 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -286,7 +286,7 @@ template static void test_execute_reduction(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int ReducedDims = NumDims - 2; static constexpr int Options = 0 | Layout; @@ -328,7 +328,7 @@ template static void test_execute_reshape(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int ReshapedDims = NumDims - 1; static constexpr int Options = 0 | Layout; @@ -368,7 +368,7 @@ template static void test_execute_slice_rvalue(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); @@ -408,7 +408,7 @@ template static void test_execute_slice_lvalue(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); From 01197e44527941c95f9a63e4f60ab3a989f12cbe Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 13 Sep 2018 15:03:36 -0700 Subject: [PATCH 10/10] Fix warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eeb2578fd..261dbffa8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -1140,7 +1140,6 @@ struct
TensorEvaluator, const Index max_coeff_count, const DSizes& input_slice_sizes, DSizes* target_input_block_sizes) const { - typedef typename internal::packet_traits::type Packet; typedef internal::BlockReducer BlockReducer; // TODO(andydavis) Compute reducer overhead correctly for the case where // we are preserving the inner most dimension, and a single reducer @@ -1168,7 +1167,6 @@ struct TensorEvaluator, } else if (!first_preserved_dim_allocated) { // TODO(andydavis) Include output block size in this L1 working set // calculation. - const Index allocated = max_coeff_count - coeff_to_allocate; const Index alloc_size = numext::maxi( static_cast(1), coeff_to_allocate / reducer_overhead); (*target_input_block_sizes)[dim] = diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 8639e7b38..1bb99854c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -27,7 +27,7 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { dims[i] = internal::random(min_dim, max_dim); } return dims; -}; +} template @@ -117,7 +117,7 @@ static void test_execute_broadcasting(Device d) for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } -}; +} template @@ -155,7 +155,7 @@ static void test_execute_chipping_rvalue(Device d) { TEST_CHIPPING(5) #undef TEST_CHIPPING -}; +} template @@ -207,7 +207,7 @@ static void test_execute_chipping_lvalue(Device d) { TEST_CHIPPING(5) #undef TEST_CHIPPING -}; +} template
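
Taken together, the series wires block evaluation from the individual evaluators up to the executor: IsTileable, as fixed in patch 07, only enables the tiled path when the expression both supports block access and prefers it. The test_execute_unary_expr test added in patch 06 drives that path directly; the following is a self-contained sketch of the same usage against the public Tensor API, assuming a build of the unsupported Tensor module that contains these patches (the tensor sizes are arbitrary).

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  using namespace Eigen;

  // Large enough that the executor does not take the small-tensor shortcut.
  Tensor<float, 3> src(17, 19, 23);
  Tensor<float, 3> dst(17, 19, 23);
  src.setRandom();

  // Build the expression lazily, then run the assignment through the
  // block-based (Tileable) TensorExecutor specialization, as the test does.
  const auto expr = src.square();
  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor = internal::TensorExecutor<const Assign, DefaultDevice,
                                            /*Vectorizable=*/true,
                                            /*Tileable=*/true>;
  DefaultDevice device;
  Executor::run(Assign(dst, expr), device);

  // The block path must produce exactly the coefficient-wise result.
  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    assert(dst.coeff(i) == src.coeff(i) * src.coeff(i));
  }
  return 0;
}

With Tileable set to false, the same assignment goes through the coefficient and packet based loop of the default TensorExecutor specialization instead of the block mapper.
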