Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-11 19:29:02 +08:00)

Commit: Add tiled evaluation support to TensorExecutor

This commit is contained in:
  parent d55efa6f0f
  commit 6913221c43
@@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGlobalFunctions.h"

 #include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"

 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
 #include "src/Tensor/TensorReductionGpu.h"
 #include "src/Tensor/TensorArgMax.h"
-#include "src/Tensor/TensorBlock.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
 #include "src/Tensor/TensorContractionBlocking.h"
@@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
   typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;

+  static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
       : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
@@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;

   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int NumDims = XprType::NumDims;

   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+                TensorEvaluator<RightArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+                   TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+                  TensorEvaluator<RightArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
   };

+  typedef typename internal::TensorBlock<
+      typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
+      TensorBlock;
+
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(
+        (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+        YOU_MADE_A_PROGRAMMING_MISTAKE);
   }

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
            TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_leftImpl.getResourceRequirements(resources);
+    m_rightImpl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+        m_leftImpl.data() != nullptr) {
+      TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
+                             block->tensor_strides(), block->tensor_strides(),
+                             m_leftImpl.data() + block->first_coeff_index());
+      m_rightImpl.block(&left_block);
+    } else {
+      m_rightImpl.block(block);
+      m_leftImpl.writeBlock(*block);
+    }
+  }
+
   /// required by sycl in order to extract the accessor
   const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
   /// required by sycl in order to extract the accessor
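The evalBlock added above takes one of two paths: if the destination (lhs) evaluator exposes raw, addressable memory, the rhs evaluates its block directly into that buffer; otherwise the rhs materializes into a scratch block which is then written back. The following is a minimal standalone sketch of that decision, using hypothetical stand-in types rather than Eigen's actual evaluator API:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the pieces evalBlock touches.
struct Block {
  std::size_t first_coeff_index;
  std::size_t size;
  float* data;  // buffer this block evaluates into
};

struct Lhs {
  float* raw = nullptr;          // non-null when raw access to the output is possible
  std::vector<float> storage;    // fallback output storage
  void writeBlock(const Block& b) {
    for (std::size_t i = 0; i < b.size; ++i)
      storage[b.first_coeff_index + i] = b.data[i];
  }
};

struct Rhs {
  void block(Block* b) const {   // evaluate the rhs expression into the block buffer
    for (std::size_t i = 0; i < b->size; ++i) b->data[i] = 1.0f;
  }
};

// Mirrors the two branches of TensorAssignOp's evalBlock.
void evalBlock(Lhs& lhs, const Rhs& rhs, Block* scratch) {
  if (lhs.raw != nullptr) {
    // Evaluate straight into the destination memory, no extra copy.
    Block dst{scratch->first_coeff_index, scratch->size,
              lhs.raw + scratch->first_coeff_index};
    rhs.block(&dst);
  } else {
    // Evaluate into scratch, then copy the block into the destination.
    rhs.block(scratch);
    lhs.writeBlock(*scratch);
  }
}

int main() {
  Lhs lhs; lhs.storage.resize(8);
  Rhs rhs;
  std::vector<float> scratch_buf(4);
  Block scratch{0, 4, scratch_buf.data()};
  evalBlock(lhs, rhs, &scratch);  // takes the copy-back path (lhs.raw == nullptr)
}
```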
@@ -65,6 +65,40 @@ enum class TensorBlockShapeType {
   kSkewedInnerDims,
 };

+struct TensorOpResourceRequirements {
+  TensorBlockShapeType block_shape;
+  std::size_t block_total_size;
+  // TODO(andydavis) Add 'target_num_threads' to support communication of
+  // thread-resource requirements. This will allow ops deep in the
+  // expression tree (like reductions) to communicate resources
+  // requirements based on local state (like the total number of reductions
+  // to be computed).
+  TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
+                               const std::size_t size)
+      : block_shape(shape), block_total_size(size) {}
+};
+
+// Tries to merge multiple resource requirements.
+EIGEN_STRONG_INLINE void MergeResourceRequirements(
+    const std::vector<TensorOpResourceRequirements>& resources,
+    TensorBlockShapeType* block_shape, std::size_t* block_total_size) {
+  if (resources.empty()) {
+    return;
+  }
+  // TODO(andydavis) Implement different policies (i.e. revert to a default
+  // policy if block shapes/sizes conflict).
+  *block_shape = resources[0].block_shape;
+  *block_total_size = resources[0].block_total_size;
+  for (int i = 1; i < resources.size(); ++i) {
+    if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
+        *block_shape != TensorBlockShapeType::kSkewedInnerDims) {
+      *block_shape = TensorBlockShapeType::kSkewedInnerDims;
+    }
+    *block_total_size =
+        numext::maxi(*block_total_size, resources[i].block_total_size);
+  }
+}
+
 /**
  * \class TensorBlock
  * \ingroup CXX11_Tensor_Module
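The merge policy above is simple: the first requirement seeds the result, any request for skewed inner dimensions overrides the current shape, and the largest requested block size wins. A self-contained sketch of that policy, with a simplified enum and struct standing in for the Eigen definitions:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

enum class BlockShape { kUniformAllDims, kSkewedInnerDims };  // simplified stand-in

struct Requirement {
  BlockShape shape;
  std::size_t total_size;
};

// Same policy as MergeResourceRequirements: skewed-inner-dims wins over a
// uniform shape, and the largest requested block size is kept.
void Merge(const std::vector<Requirement>& reqs,
           BlockShape* shape, std::size_t* size) {
  if (reqs.empty()) return;
  *shape = reqs[0].shape;
  *size = reqs[0].total_size;
  for (std::size_t i = 1; i < reqs.size(); ++i) {
    if (reqs[i].shape == BlockShape::kSkewedInnerDims)
      *shape = BlockShape::kSkewedInnerDims;
    *size = std::max(*size, reqs[i].total_size);
  }
}
```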
@@ -74,7 +108,7 @@ enum class TensorBlockShapeType {
  * This class represents a tensor block specified by the index of the
  * first block coefficient, and the size of the block in each dimension.
  */
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename Index, int NumDims, int Layout>
 class TensorBlock {
  public:
   typedef DSizes<Index, NumDims> Dimensions;
@@ -614,6 +648,83 @@ struct TensorBlockCwiseBinaryIO {
   }
 };

+/**
+ * \class TensorBlockView
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Read-only view into a block of data.
+ *
+ * This class provides read-only access to a block of data in impl. It may need
+ * to allocate space for holding the intermediate result.
+ *
+ */
+template <class ArgType, class Device>
+struct TensorBlockView {
+  typedef TensorEvaluator<ArgType, Device> Impl;
+  typedef typename Impl::Index Index;
+  typedef typename remove_const<typename Impl::Scalar>::type Scalar;
+  static const int NumDims = array_size<typename Impl::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // Constructs a TensorBlockView for `impl`. `block` is only used for
+  // specifying the start offset, shape, and strides of the block.
+  template <typename OtherTensorBlock>
+  TensorBlockView(const Device& device,
+                  const TensorEvaluator<ArgType, Device>& impl,
+                  const OtherTensorBlock& block)
+      : m_device(device),
+        m_block_sizes(block.block_sizes()),
+        m_data(NULL),
+        m_allocated_data(NULL) {
+    if (Impl::RawAccess && impl.data() != NULL) {
+      m_data = impl.data() + block.first_coeff_index();
+      m_block_strides = block.tensor_strides();
+    } else {
+      // Actually make a copy.
+
+      // TODO(wuke): This sometimes put a lot of pressure on the heap allocator.
+      // Consider allowing ops to request additional temporary block memory in
+      // TensorOpResourceRequirements.
+      m_allocated_data = static_cast<Scalar*>(
+          m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
+      m_data = m_allocated_data;
+      if (NumDims > 0) {
+        if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
+          m_block_strides[0] = 1;
+          for (int i = 1; i < NumDims; ++i) {
+            m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
+          }
+        } else {
+          m_block_strides[NumDims - 1] = 1;
+          for (int i = NumDims - 2; i >= 0; --i) {
+            m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
+          }
+        }
+      }
+      TensorBlock<Scalar, Index, NumDims, Impl::Layout> input_block(
+          block.first_coeff_index(), m_block_sizes, m_block_strides,
+          block.tensor_strides(), m_allocated_data);
+      impl.block(&input_block);
+    }
+  }
+
+  ~TensorBlockView() {
+    if (m_allocated_data != NULL) {
+      m_device.deallocate(m_allocated_data);
+    }
+  }
+
+  const Dimensions& block_sizes() const { return m_block_sizes; }
+  const Dimensions& block_strides() const { return m_block_strides; }
+  const Scalar* data() const { return m_data; }
+
+ private:
+  const Device& m_device;
+  Dimensions m_block_sizes, m_block_strides;
+  const Scalar* m_data;      // Not owned.
+  Scalar* m_allocated_data;  // Owned.
+};
+
 /**
  * \class TensorBlockMapper
  * \ingroup CXX11_Tensor_Module
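When TensorBlockView has to materialize a copy, it derives dense strides from the block sizes, column-major or row-major depending on the evaluator layout. A self-contained sketch of that stride computation (not Eigen code, just the same arithmetic):

```cpp
#include <cstddef>
#include <vector>

// Dense strides for a block with the given per-dimension sizes.
// col_major == true reproduces the ColMajor branch above, false the RowMajor one.
std::vector<std::ptrdiff_t> DenseStrides(const std::vector<std::ptrdiff_t>& sizes,
                                         bool col_major) {
  const int n = static_cast<int>(sizes.size());
  std::vector<std::ptrdiff_t> strides(n);
  if (n == 0) return strides;
  if (col_major) {
    strides[0] = 1;
    for (int i = 1; i < n; ++i) strides[i] = strides[i - 1] * sizes[i - 1];
  } else {
    strides[n - 1] = 1;
    for (int i = n - 2; i >= 0; --i) strides[i] = strides[i + 1] * sizes[i + 1];
  }
  return strides;
}

// Example: DenseStrides({2, 3, 4}, /*col_major=*/true) yields {1, 2, 6}.
```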
@@ -1,4 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
+// This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
@@ -110,6 +111,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   enum {
     IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
@@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     // slice offsets.
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     RawAccess = false
   };
@@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     RawAccess = false
   };
@@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     RawAccess = false
   };
@@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase
   enum {
     IsAligned = true,
     PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = false,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = true
@@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = true,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
@@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<InputArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -778,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
   enum {
     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
     PacketAccess = false,
+    BlockAccess = false,
     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
   enum {
     IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
     PacketAccess = false,
+    BlockAccess = false,
     Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -290,6 +290,22 @@ struct DSizes : array<DenseIndex, NumDims> {
     }
   }

+#ifndef EIGEN_EMULATE_CXX11_META_H
+  template <typename std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+#else
+  template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+#endif
+
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template<typename... IndexTypes> EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
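With the converting constructor added above, a compile-time Sizes<> list can be copied directly into a runtime DSizes. A minimal usage sketch, assuming a C++11 build where the variadic Sizes<> path (not the emulated one) is active and this commit's constructor is available:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Sizes<2, 3, 4> fixed;                      // compile-time dimensions
  Eigen::DSizes<Eigen::DenseIndex, 3> dyn(fixed);   // uses the new converting constructor
  std::cout << dyn[0] << "x" << dyn[1] << "x" << dyn[2] << "\n";  // prints 2x3x4
}
```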
@@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = true
@@ -41,11 +41,24 @@ struct TensorEvaluator
   enum {
     IsAligned = Derived::IsAligned,
     PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
     Layout = Derived::Layout,
     CoordAccess = NumCoords > 0,
     RawAccess = true
   };

+  typedef typename internal::TensorBlock<
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+      TensorBlock;
+  typedef typename internal::TensorBlockReader<
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
+      PacketAccess>
+      TensorBlockReader;
+  typedef typename internal::TensorBlockWriter<
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
+      PacketAccess>
+      TensorBlockWriter;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
       : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
   { }
@@ -113,6 +126,20 @@ struct TensorEvaluator
                         internal::unpacket_traits<PacketReturnType>::size);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+    assert(m_data != NULL);
+    TensorBlockReader::Run(block, m_data);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlock& block) {
+    assert(m_data != NULL);
+    TensorBlockWriter::Run(block, m_data);
+  }
+
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }

   /// required by sycl in order to construct sycl buffer from raw pointer
|
|||||||
enum {
|
enum {
|
||||||
IsAligned = Derived::IsAligned,
|
IsAligned = Derived::IsAligned,
|
||||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||||
|
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||||
Layout = Derived::Layout,
|
Layout = Derived::Layout,
|
||||||
CoordAccess = NumCoords > 0,
|
CoordAccess = NumCoords > 0,
|
||||||
RawAccess = true
|
RawAccess = true
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef typename internal::TensorBlock<
|
||||||
|
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||||
|
TensorBlock;
|
||||||
|
typedef typename internal::TensorBlockReader<
|
||||||
|
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
|
||||||
|
PacketAccess>
|
||||||
|
TensorBlockReader;
|
||||||
|
|
||||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
// Used for accessor extraction in SYCL Managed TensorMap:
|
||||||
const Derived& derived() const { return m_impl; }
|
const Derived& derived() const { return m_impl; }
|
||||||
|
|
||||||
@ -219,6 +255,14 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
internal::unpacket_traits<PacketReturnType>::size);
|
internal::unpacket_traits<PacketReturnType>::size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||||
|
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||||
|
assert(m_data != NULL);
|
||||||
|
TensorBlockReader::Run(block, m_data);
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
|
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
|
||||||
|
|
||||||
/// added for sycl in order to construct the buffer from the sycl device
|
/// added for sycl in order to construct the buffer from the sycl device
|
||||||
@ -244,6 +288,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
|||||||
enum {
|
enum {
|
||||||
IsAligned = true,
|
IsAligned = true,
|
||||||
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
|
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
CoordAccess = false, // to be implemented
|
CoordAccess = false, // to be implemented
|
||||||
RawAccess = false
|
RawAccess = false
|
||||||
@ -308,7 +353,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
|||||||
|
|
||||||
enum {
|
enum {
|
||||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||||
|
internal::functor_traits<UnaryOp>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
CoordAccess = false, // to be implemented
|
CoordAccess = false, // to be implemented
|
||||||
RawAccess = false
|
RawAccess = false
|
||||||
@@ -375,16 +422,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;

   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
+    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+                TensorEvaluator<RightArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+                   TensorEvaluator<RightArgType, Device>::PacketAccess &
                    internal::functor_traits<BinaryOp>::PacketAccess,
+    BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+                  TensorEvaluator<RightArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };

   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_functor(op.functor()),
+    : m_device(device),
+      m_functor(op.functor()),
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
@@ -399,6 +451,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;

+  static const int NumDims = internal::array_size<
+      typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+  typedef internal::TensorBlock<
+      typename internal::remove_const<Scalar>::type, Index, NumDims,
+      TensorEvaluator<LeftArgType, Device>::Layout>
+      TensorBlock;
+
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
     // TODO: use right impl instead if right impl dimensions are known at compile time.
@@ -433,6 +493,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_leftImpl.getResourceRequirements(resources);
+    m_rightImpl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    if (NumDims <= 0) {
+      output_block->data()[0] = coeff(0);
+      return;
+    }
+    internal::TensorBlockView<LeftArgType, Device> left_block(
+        m_device, m_leftImpl, *output_block);
+    internal::TensorBlockView<RightArgType, Device> right_block(
+        m_device, m_rightImpl, *output_block);
+    internal::TensorBlockCwiseBinaryIO<
+        BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
+        Layout>::Run(m_functor, output_block->block_sizes(),
+                     output_block->block_strides(), output_block->data(),
+                     left_block.block_strides(), left_block.data(),
+                     right_block.block_strides(), right_block.data());
+  }
+
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
   /// required by sycl in order to extract the accessor
   const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
@@ -442,6 +526,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   BinaryOp functor() const { return m_functor; }

  private:
+  const Device& m_device;
   const BinaryOp m_functor;
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
   TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -458,6 +543,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
     IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
     PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
                    internal::functor_traits<TernaryOp>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<Arg1Type, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -562,6 +648,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
     IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
                    internal::packet_traits<Scalar>::HasBlend,
+    BlockAccess = false,
     Layout = TensorEvaluator<IfArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
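The block() method added to the binary evaluator above wraps both operands in (possibly strided) block views and applies the functor coefficient-wise through TensorBlockCwiseBinaryIO. A simplified, self-contained sketch of that inner kernel for the contiguous 1-D case only (the real routine handles arbitrary dimensions and strides):

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Apply `op` over `n` coefficients of two contiguous input blocks, writing the
// result into `out` -- the stride-1 special case of TensorBlockCwiseBinaryIO::Run.
template <typename T, typename BinaryOp>
void CwiseBinaryBlock(BinaryOp op, std::size_t n,
                      T* out, const T* lhs, const T* rhs) {
  for (std::size_t i = 0; i < n; ++i) out[i] = op(lhs[i], rhs[i]);
}

int main() {
  std::vector<float> a{1, 2, 3, 4}, b{10, 20, 30, 40}, c(4);
  CwiseBinaryBlock(std::plus<float>(), c.size(), c.data(), a.data(), b.data());
  // c == {11, 22, 33, 44}
}
```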
@@ -12,29 +12,37 @@

 namespace Eigen {

-/** \class TensorExecutor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor executor class.
- *
- * This class is responsible for launch the evaluation of the expression on
- * the specified computing device.
- */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tileable     can use block based tensor evaluation (see TensorBlock.h)
+ */
 namespace internal {

-// Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device, bool Vectorizable>
-class TensorExecutor
-{
+/**
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization and block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+          bool Tileable>
+class TensorExecutor {
  public:
   typedef typename Expression::Index Index;
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const Device& device = Device())
-  {
+  static inline void run(const Expression& expr,
+                         const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
+    if (needs_assign) {
       const Index size = array_prod(evaluator.dimensions());
       for (Index i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
@@ -44,12 +52,14 @@ class TensorExecutor
   }
 };

-template<typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true>
-{
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> {
  public:
   typedef typename Expression::Index Index;

   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
   {
@@ -58,9 +68,11 @@ class TensorExecutor<Expression, DefaultDevice, true>
     if (needs_assign)
     {
       const Index size = array_prod(evaluator.dimensions());
-      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      // Give the compiler a strong hint to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive the compiler should not
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<
+          Expression, DefaultDevice>::PacketReturnType>::size;
+
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
       const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
       for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
@@ -80,9 +92,75 @@ class TensorExecutor<Expression, DefaultDevice, true>
   }
 };

+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> {
+ public:
+  typedef typename Expression::Index Index;
+
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
+    using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+
+    using Index = typename traits<Expression>::Index;
+    const int NumDims = traits<Expression>::NumDimensions;
+
+    using Scalar = typename traits<Expression>::Scalar;
+    using ScalarNoConst = typename remove_const<Scalar>::type;
+
+    using TensorBlock =
+        TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper =
+        TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+
+    Evaluator evaluator(expr, device);
+    std::size_t total_size = array_prod(evaluator.dimensions());
+    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+    if (total_size < cache_size) {
+      // TODO(andydavis) Reduce block management overhead for small tensors.
+      // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
+      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                               false>::run(expr, device);
+      return;
+    }
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      // Size tensor blocks to fit in cache (or requested target block size).
+      size_t block_total_size = numext::mini(cache_size, total_size);
+      TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+      // Query expression tree for desired block size/shape.
+      std::vector<TensorOpResourceRequirements> resources;
+      evaluator.getResourceRequirements(&resources);
+      MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+      TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+                                     block_total_size);
+      block_total_size = block_mapper.block_dims_total_size();
+
+      Scalar* data = static_cast<Scalar*>(
+          device.allocate(block_total_size * sizeof(Scalar)));
+
+      const Index total_block_count = block_mapper.total_block_count();
+      for (Index i = 0; i < total_block_count; ++i) {
+        TensorBlock block = block_mapper.GetBlockForIndex(i, data);
+        evaluator.evalBlock(&block);
+      }
+      device.deallocate(data);
+    }
+    evaluator.cleanup();
+  }
+};
+
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ */
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EvalRange {
@@ -100,7 +178,7 @@ struct EvalRange {
 };

 template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
+struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
   static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;

   static void run(Evaluator* evaluator_in, const Index first, const Index last) {
@@ -110,8 +188,8 @@ struct EvalRange<Evaluator, Index, true> {
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
       Index last_chunk_offset = last - 4 * PacketSize;
-      // Give the compiler a strong hint to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive the compiler should not
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
       for (; i <= last_chunk_offset; i += 4*PacketSize) {
         for (Index j = 0; j < 4; j++) {
@@ -138,55 +216,113 @@ struct EvalRange<Evaluator, Index, true> {
   }
 };

-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
  public:
   typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
-  {
-    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    Evaluator evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      size_t num_threads = device.numThreads();
-      if (num_threads > 1) {
-        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-            size, evaluator.costPerCoeff(Vectorizable), num_threads);
-      }
-      if (num_threads == 1) {
-        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
-      } else {
-        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
-        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-        const Index numblocks = size / blocksize;
-
-        Barrier barrier(numblocks);
-        for (int i = 0; i < numblocks; ++i) {
-          device.enqueue_with_barrier(
-              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
-              &evaluator, i * blocksize, (i + 1) * blocksize);
-        }
-        if (numblocks * blocksize < size) {
-          EvalRange<Evaluator, Index, Vectorizable>::run(
-              &evaluator, numblocks * blocksize, size);
-        }
-        barrier.Wait();
-      }
+
+  static inline void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
+    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+    typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+
+    Evaluator evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const Index PacketSize =
+          Vectorizable
+              ? unpacket_traits<typename Evaluator::PacketReturnType>::size
+              : 1;
+      const Index size = array_prod(evaluator.dimensions());
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange::run(&evaluator, first, last);
+                         });
     }
     evaluator.cleanup();
   }
 };
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
+ public:
+  typedef typename Expression::Index Index;
+
+  static inline void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
+    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+    typedef typename internal::remove_const<
+        typename traits<Expression>::Scalar>::type Scalar;
+    typedef typename traits<Expression>::Index Index;
+
+    static const int NumDims = traits<Expression>::NumDimensions;
+
+    typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
+    typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
+        TensorBlockMapper;
+
+    Evaluator evaluator(expr, device);
+    std::size_t total_size = array_prod(evaluator.dimensions());
+    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    if (total_size < cache_size) {
+      // TODO(andydavis) Reduce block management overhead for small tensors.
+      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                               false>::run(expr, device);
+      evaluator.cleanup();
+      return;
+    }
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+      size_t block_total_size = 0;
+      // Query expression tree for desired block size/shape.
+      std::vector<internal::TensorOpResourceRequirements> resources;
+      evaluator.getResourceRequirements(&resources);
+      MergeResourceRequirements(resources, &block_shape, &block_total_size);
+      int num_threads = device.numThreads();
+
+      // Estimate minimum block size based on cost.
+      TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
+      double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
+      size_t block_size = static_cast<size_t>(1.0 / taskSize);
+      TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+                                     block_size);
+      block_size = block_mapper.block_dims_total_size();
+      const size_t aligned_blocksize =
+          EIGEN_MAX_ALIGN_BYTES *
+          divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
+      void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+      device.parallelFor(
+          block_mapper.total_block_count(), cost * block_size,
+          [=, &device, &evaluator, &block_mapper](Index first, Index last) {
+            // currentThreadId() returns -1 if called from a thread not in the
+            // threadpool, such as the main thread dispatching Eigen
+            // expressions.
+            const int thread_idx = device.currentThreadId();
+            eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
+            Scalar* thread_buf = reinterpret_cast<Scalar*>(
+                static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
+            for (Index i = first; i < last; ++i) {
+              auto block = block_mapper.GetBlockForIndex(i, thread_buf);
+              evaluator.evalBlock(&block);
+            }
+          });
+      device.deallocate(buf);
+    }
+    evaluator.cleanup();
+  }
+};
+
 #endif // EIGEN_USE_THREADS
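Tiled execution is selected automatically when the whole expression tree reports BlockAccess (see the IsTileable trait below); user-facing code does not change. A minimal usage sketch, assuming this commit is applied: the coefficient-wise sum below exposes block access on both operands, so on the default or thread-pool device the assignment may now run block by block instead of coefficient by coefficient.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> a(64, 64, 64), b(64, 64, 64), c(64, 64, 64);
  a.setRandom();
  b.setRandom();
  // TensorAssignOp of a cwise binary expression: both operands report
  // BlockAccess, so TensorExecutor<..., Tileable = true> can be chosen.
  c = a + b;
  return 0;
}
```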
|
|
||||||
// GPU: the evaluation of the expression is offloaded to a GPU.
|
// GPU: the evaluation of the expression is offloaded to a GPU.
|
||||||
#if defined(EIGEN_USE_GPU)
|
#if defined(EIGEN_USE_GPU)
|
||||||
|
|
||||||
template <typename Expression, bool Vectorizable>
|
template <typename Expression, bool Vectorizable, bool Tileable>
|
||||||
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
|
||||||
public:
|
public:
|
||||||
typedef typename Expression::Index Index;
|
typedef typename Expression::Index Index;
|
||||||
static void run(const Expression& expr, const GpuDevice& device);
|
static void run(const Expression& expr, const GpuDevice& device);
|
||||||
@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*static*/
|
/*static*/
|
||||||
template <typename Expression, bool Vectorizable>
|
template <typename Expression, bool Vectorizable, bool Tileable>
|
||||||
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
|
||||||
const Expression& expr, const GpuDevice& device) {
|
const Expression& expr, const GpuDevice& device) {
|
||||||
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
|
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
|
||||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||||
|
@@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
   enum {
     IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
     Layout = Options_ & RowMajor ? RowMajor : ColMajor,
     CoordAccess = true,
     RawAccess = true
@@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
   enum {
     IsAligned = true,
     PacketAccess = (PacketSize > 1),
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = true
   };
@@ -129,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> {
                             TensorEvaluator<Expression, GpuDevice>::IsAligned;
 };

+template <typename Device, typename Expression>
+struct IsTileable {
+  static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
+};
+
 template <typename Expression, typename Device,
-          bool Vectorizable = IsVectorizable<Device, Expression>::value>
+          bool Vectorizable = IsVectorizable<Device, Expression>::value,
+          bool Tileable = IsTileable<Device, Expression>::value>
 class TensorExecutor;

 } // end namespace internal
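The Tileable default parameter works exactly like the existing Vectorizable one: a trait inspects the evaluator and the executor is specialized on the resulting boolean. A stripped-down model of that dispatch, with hypothetical names standing in for the real evaluators:

```cpp
#include <iostream>

// Stand-ins for evaluators that do / do not advertise block access.
struct BlockyEvaluator { static const bool BlockAccess = true; };
struct ScalarEvaluator { static const bool BlockAccess = false; };

// Same shape as the IsTileable trait added above.
template <typename Evaluator>
struct IsTileable {
  static const bool value = Evaluator::BlockAccess;
};

// Primary template: coefficient-by-coefficient fallback.
template <typename Evaluator, bool Tileable = IsTileable<Evaluator>::value>
struct Executor {
  static void run() { std::cout << "coefficient-by-coefficient\n"; }
};

// Specialization chosen when the trait reports block access.
template <typename Evaluator>
struct Executor<Evaluator, true> {
  static void run() { std::cout << "block-by-block\n"; }
};

int main() {
  Executor<ScalarEvaluator>::run();  // coefficient-by-coefficient
  Executor<BlockyEvaluator>::run();  // block-by-block
}
```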
@@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
@@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -181,6 +182,7 @@ template<typename ArgType, typename Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false // to be implemented
   };
@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
|||||||
enum {
|
enum {
|
||||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
CoordAccess = false, // to be implemented
|
CoordAccess = false, // to be implemented
|
||||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||||
@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
|||||||
enum {
|
enum {
|
||||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
CoordAccess = false, // to be implemented
|
CoordAccess = false, // to be implemented
|
||||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||||
@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
|||||||
// slice offsets and sizes.
|
// slice offsets and sizes.
|
||||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
CoordAccess = false,
|
CoordAccess = false,
|
||||||
RawAccess = false
|
RawAccess = false
|
||||||
@ -557,6 +560,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
|||||||
enum {
|
enum {
|
||||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
CoordAccess = false,
|
CoordAccess = false,
|
||||||
RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
|
RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
|
||||||
@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
|||||||
static const int NumDims = internal::array_size<Strides>::value;
|
static const int NumDims = internal::array_size<Strides>::value;
|
||||||
typedef typename XprType::Index Index;
|
typedef typename XprType::Index Index;
|
||||||
typedef typename XprType::Scalar Scalar;
|
typedef typename XprType::Scalar Scalar;
|
||||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
|
||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
typedef Strides Dimensions;
|
typedef Strides Dimensions;
|
||||||
@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
|
|||||||
|
|
||||||
typedef typename XprType::Index Index;
|
typedef typename XprType::Index Index;
|
||||||
typedef typename XprType::Scalar Scalar;
|
typedef typename XprType::Scalar Scalar;
|
||||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
|
||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
typedef Strides Dimensions;
|
typedef Strides Dimensions;
|
||||||
|
@@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
   enum {
     IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = true,
     RawAccess = false

@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false

@@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
   enum {
     IsAligned = false,
     PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
     Layout = PlainObjectType::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
     Layout = TensorRef<Derived>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
     RawAccess = false
   };

@@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
     RawAccess = false
   };

@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
@@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_striding)
   ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_ref)
   ei_add_test(cxx11_tensor_random)
   ei_add_test(cxx11_tensor_generator)
@@ -901,7 +901,7 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
   CALL_SUBTEST(NAME<ColMajor>(ARG)); \
   CALL_SUBTEST(NAME<RowMajor>(ARG))
 
-EIGEN_DECLARE_TEST(cxx11_tensor_assign) {
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
   CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity);
   CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element);
   CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element);

@@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() {
 }
 
 
-void test_cxx11_tensor_complex_cwise_ops()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
 {
   CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
   CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());

@@ -177,7 +177,7 @@ static void test_cuda_product_reductions() {
 }
 
 
-void test_cxx11_tensor_complex()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
 {
   CALL_SUBTEST(test_cuda_nullary());
   CALL_SUBTEST(test_cuda_sum_reductions());
unsupported/test/cxx11_tensor_executor.cpp (new file, 81 lines)
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Index;
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+
+// A set of tests to verify that different TensorExecutor strategies yields the
+// same results for all the ops, supporting tiled execution.
+
+template <typename Device, bool Vectorizable, bool Tileable, int Layout>
+static void test_execute_binary_expr(Device d) {
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  Tensor<float, 3> lhs(840, 390, 37);
+  Tensor<float, 3> rhs(840, 390, 37);
+  Tensor<float, 3> dst(840, 390, 37);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (int i = 0; i < 840; ++i) {
+    for (int j = 0; j < 390; ++j) {
+      for (int k = 0; k < 37; ++k) {
+        float sum = lhs(i, j, k) + rhs(i, j, k);
+        VERIFY_IS_EQUAL(sum, dst(i, j, k));
+      }
+    }
+  }
+}
+
+#define CALL_SUBTEST_COMBINATIONS(NAME) \
+  CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \
+  CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
+  CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+  Eigen::DefaultDevice default_device;
+
+  const auto num_threads = internal::random<int>(1, 24);
+  Eigen::ThreadPool tp(num_threads);
+  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+  CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
+}
+
+#undef CALL_SUBTEST_COMBINATIONS
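Outside of this test, TensorExecutor is not instantiated directly: assigning a tensor expression through a device object reaches the same code path, with Vectorizable and Tileable filled in from the traits added above, so the test's explicit Executor::run call exists only to pin down one specific combination. A minimal usage sketch, assuming the Eigen source tree is on the include path (sizes and thread count are arbitrary):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> lhs(64, 32, 8), rhs(64, 32, 8), dst(64, 32, 8);
  lhs.setRandom();
  rhs.setRandom();

  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  // Equivalent to the test's Executor::run(Assign(dst, lhs + rhs), device),
  // except that the executor and its template flags are chosen by the traits.
  dst.device(device) = lhs + rhs;
  return 0;
}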