Add tiled evaluation support to TensorExecutor

Eugene Zhulenev 2018-07-25 13:51:10 -07:00
parent d55efa6f0f
commit 6913221c43
33 changed files with 598 additions and 93 deletions
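The change is easiest to see from the call site that eventually reaches internal::TensorExecutor. Below is a minimal sketch, not part of the patch, of the kind of coefficient-wise assignment it targets: with this commit, when every evaluator in the expression tree reports BlockAccess, the executor may evaluate the assignment tile by tile instead of coefficient by coefficient.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();

  // Assignment through a device is dispatched to internal::TensorExecutor;
  // the new Tileable template argument decides whether it runs block by block.
  Eigen::DefaultDevice device;
  c.device(device) = a + b;
  return 0;
}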

View File

@@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"

View File

@@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
@@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
static const int NumDims = XprType::NumDims;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT(
(static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
}
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {
m_leftImpl.getResourceRequirements(resources);
m_rightImpl.getResourceRequirements(resources);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != nullptr) {
TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
block->tensor_strides(), block->tensor_strides(),
m_leftImpl.data() + block->first_coeff_index());
m_rightImpl.block(&left_block);
} else {
m_rightImpl.block(block);
m_leftImpl.writeBlock(*block);
}
}
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor

View File

@@ -65,6 +65,40 @@ enum class TensorBlockShapeType {
kSkewedInnerDims,
};
struct TensorOpResourceRequirements {
TensorBlockShapeType block_shape;
std::size_t block_total_size;
// TODO(andydavis) Add 'target_num_threads' to support communication of
// thread-resource requirements. This will allow ops deep in the
// expression tree (like reductions) to communicate resource
// requirements based on local state (like the total number of reductions
// to be computed).
TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
const std::size_t size)
: block_shape(shape), block_total_size(size) {}
};
// Tries to merge multiple resource requirements.
EIGEN_STRONG_INLINE void MergeResourceRequirements(
const std::vector<TensorOpResourceRequirements>& resources,
TensorBlockShapeType* block_shape, std::size_t* block_total_size) {
if (resources.empty()) {
return;
}
// TODO(andydavis) Implement different policies (i.e. revert to a default
// policy if block shapes/sizes conflict).
*block_shape = resources[0].block_shape;
*block_total_size = resources[0].block_total_size;
for (int i = 1; i < resources.size(); ++i) {
if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
*block_shape != TensorBlockShapeType::kSkewedInnerDims) {
*block_shape = TensorBlockShapeType::kSkewedInnerDims;
}
*block_total_size =
numext::maxi(*block_total_size, resources[i].block_total_size);
}
}
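A minimal sketch, not part of the patch, of the merge policy implemented above: a kSkewedInnerDims request wins over any other shape, and the largest requested block size is kept. It assumes the enum's other value is kUniformAllDims, which lies outside this hunk.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <vector>

int main() {
  using namespace Eigen::internal;
  std::vector<TensorOpResourceRequirements> resources;
  // kUniformAllDims is assumed here; only kSkewedInnerDims is visible above.
  resources.emplace_back(TensorBlockShapeType::kUniformAllDims, 1024);
  resources.emplace_back(TensorBlockShapeType::kSkewedInnerDims, 4096);

  TensorBlockShapeType shape = TensorBlockShapeType::kUniformAllDims;
  std::size_t size = 0;
  MergeResourceRequirements(resources, &shape, &size);

  assert(shape == TensorBlockShapeType::kSkewedInnerDims);  // skewed wins
  assert(size == 4096);                                     // max size kept
  return 0;
}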
/**
* \class TensorBlock
* \ingroup CXX11_Tensor_Module
@@ -74,7 +108,7 @@ enum class TensorBlockShapeType {
* This class represents a tensor block specified by the index of the
* first block coefficient, and the size of the block in each dimension.
*/
template <typename Scalar, typename Index, int NumDims, int Layout>
class TensorBlock {
public:
typedef DSizes<Index, NumDims> Dimensions;
@@ -614,6 +648,83 @@ struct TensorBlockCwiseBinaryIO {
}
};
/**
* \class TensorBlockView
* \ingroup CXX11_Tensor_Module
*
* \brief Read-only view into a block of data.
*
* This class provides read-only access to a block of data in impl. It may need
* to allocate space for holding the intermediate result.
*
*/
template <class ArgType, class Device>
struct TensorBlockView {
typedef TensorEvaluator<ArgType, Device> Impl;
typedef typename Impl::Index Index;
typedef typename remove_const<typename Impl::Scalar>::type Scalar;
static const int NumDims = array_size<typename Impl::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
// Constructs a TensorBlockView for `impl`. `block` is only used for
// specifying the start offset, shape, and strides of the block.
template <typename OtherTensorBlock>
TensorBlockView(const Device& device,
const TensorEvaluator<ArgType, Device>& impl,
const OtherTensorBlock& block)
: m_device(device),
m_block_sizes(block.block_sizes()),
m_data(NULL),
m_allocated_data(NULL) {
if (Impl::RawAccess && impl.data() != NULL) {
m_data = impl.data() + block.first_coeff_index();
m_block_strides = block.tensor_strides();
} else {
// Actually make a copy.
// TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
// Consider allowing ops to request additional temporary block memory in
// TensorOpResourceRequirements.
m_allocated_data = static_cast<Scalar*>(
m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
m_data = m_allocated_data;
if (NumDims > 0) {
if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
m_block_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
}
} else {
m_block_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
}
}
}
TensorBlock<Scalar, Index, NumDims, Impl::Layout> input_block(
block.first_coeff_index(), m_block_sizes, m_block_strides,
block.tensor_strides(), m_allocated_data);
impl.block(&input_block);
}
}
~TensorBlockView() {
if (m_allocated_data != NULL) {
m_device.deallocate(m_allocated_data);
}
}
const Dimensions& block_sizes() const { return m_block_sizes; }
const Dimensions& block_strides() const { return m_block_strides; }
const Scalar* data() const { return m_data; }
private:
const Device& m_device;
Dimensions m_block_sizes, m_block_strides;
const Scalar* m_data; // Not owned.
Scalar* m_allocated_data; // Owned.
};
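For orientation, a minimal sketch, not part of the patch, of how the pre-existing TensorBlockMapper (declared further down in this file) hands out blocks over a scratch buffer; the calls mirror the tiled executor added later in this commit, and the internal API may differ in detail.

#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  using namespace Eigen;
  using namespace Eigen::internal;

  // Partition a 128x128 column-major tensor into skewed-inner-dim blocks of
  // roughly 1024 coefficients each.
  DSizes<DenseIndex, 2> dims(128, 128);
  TensorBlockMapper<float, DenseIndex, 2, ColMajor> mapper(
      dims, TensorBlockShapeType::kSkewedInnerDims, 1024);

  std::vector<float> scratch(mapper.block_dims_total_size());
  for (DenseIndex i = 0; i < mapper.total_block_count(); ++i) {
    // Each block records its offset, sizes and strides inside the full tensor.
    auto block = mapper.GetBlockForIndex(i, scratch.data());
    (void)block;
  }
  return 0;
}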
/**
* \class TensorBlockMapper
* \ingroup CXX11_Tensor_Module

View File

@@ -1,4 +1,5 @@
// This file is part of Eigen, a lightweight C++ template library
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
@@ -110,6 +111,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};

View File

@@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
// slice offsets.
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
RawAccess = false
};

View File

@@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
@@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};

View File

@@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase
enum {
IsAligned = true,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true

View File

@@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};

View File

@@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D(
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
@@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D(
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
@@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
@@ -778,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false

View File

@@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false

View File

@@ -290,6 +290,22 @@ struct DSizes : array<DenseIndex, NumDims> {
}
}
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
#else
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
#endif
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
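A minimal sketch, not part of the patch's files, of what the new converting constructor enables: turning a compile-time Sizes<> list into a runtime DSizes<> of the same rank.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Sizes<2, 3, 4> fixed_dims;                              // compile-time shape
  Eigen::DSizes<Eigen::DenseIndex, 3> dynamic_dims(fixed_dims);  // new converting ctor
  return (dynamic_dims[0] == 2 && dynamic_dims[1] == 3 && dynamic_dims[2] == 4)
             ? 0
             : 1;
}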

View File

@@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true

View File

@@ -41,11 +41,24 @@ struct TensorEvaluator
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
PacketAccess>
TensorBlockReader;
typedef typename internal::TensorBlockWriter<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
PacketAccess>
TensorBlockWriter;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
{ }
@@ -113,6 +126,20 @@ struct TensorEvaluator
internal::unpacket_traits<PacketReturnType>::size);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
assert(m_data != NULL);
TensorBlockReader::Run(block, m_data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlock& block) {
assert(m_data != NULL);
TensorBlockWriter::Run(block, m_data);
}
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
/// required by sycl in order to construct sycl buffer from raw pointer
@@ -167,11 +194,20 @@ struct TensorEvaluator<const Derived, Device>
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
PacketAccess>
TensorBlockReader;
// Used for accessor extraction in SYCL Managed TensorMap:
const Derived& derived() const { return m_impl; }
@@ -219,6 +255,14 @@ struct TensorEvaluator<const Derived, Device>
internal::unpacket_traits<PacketReturnType>::size);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
assert(m_data != NULL);
TensorBlockReader::Run(block, m_data);
}
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
/// added for sycl in order to construct the buffer from the sycl device
@@ -244,6 +288,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -308,7 +353,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
internal::functor_traits<UnaryOp>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -375,16 +422,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
TensorEvaluator<RightArgType, Device>::PacketAccess &
internal::functor_traits<BinaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
@@ -399,6 +451,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
static const int NumDims = internal::array_size<
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
typedef internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumDims,
TensorEvaluator<LeftArgType, Device>::Layout>
TensorBlock;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@@ -433,6 +493,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {
m_leftImpl.getResourceRequirements(resources);
m_rightImpl.getResourceRequirements(resources);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
TensorBlock* output_block) const {
if (NumDims <= 0) {
output_block->data()[0] = coeff(0);
return;
}
internal::TensorBlockView<LeftArgType, Device> left_block(
m_device, m_leftImpl, *output_block);
internal::TensorBlockView<RightArgType, Device> right_block(
m_device, m_rightImpl, *output_block);
internal::TensorBlockCwiseBinaryIO<
BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
Layout>::Run(m_functor, output_block->block_sizes(),
output_block->block_strides(), output_block->data(),
left_block.block_strides(), left_block.data(),
right_block.block_strides(), right_block.data());
}
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
@@ -442,6 +526,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
BinaryOp functor() const { return m_functor; }
private:
const Device& m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -458,6 +543,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -562,6 +648,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
internal::packet_traits<Scalar>::HasBlend,
BlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false

View File

@@ -12,29 +12,37 @@
namespace Eigen {
/**
* \class TensorExecutor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor executor class.
*
* This class is responsible for launching the evaluation of the expression on
* the specified computing device.
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
* @tparam Tileable can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
/**
* Default strategy: the expression is evaluated sequentially with a single cpu
* thread, without vectorization and block evaluation.
*/
template <typename Expression, typename Device, bool Vectorizable,
bool Tileable>
class TensorExecutor {
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr,
const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const Index size = array_prod(evaluator.dimensions());
for (Index i = 0; i < size; ++i) {
evaluator.evalScalar(i);
@@ -44,12 +52,14 @@ class TensorExecutor
}
};
/**
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tileable*/ false> {
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
{
@@ -58,9 +68,11 @@ class TensorExecutor<Expression, DefaultDevice, true>
if (needs_assign)
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<
Expression, DefaultDevice>::PacketReturnType>::size;
// Give the compiler a strong hint to unroll the loop. But don't insist
// on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining.
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
@@ -80,9 +92,75 @@ class TensorExecutor<Expression, DefaultDevice, true>
}
};
/**
* Process all the data with a single cpu thread, using blocks of data. By
* sizing a block to fit L1 cache we get better cache performance.
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tileable*/ true> {
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
using Index = typename traits<Expression>::Index;
const int NumDims = traits<Expression>::NumDimensions;
using Scalar = typename traits<Expression>::Scalar;
using ScalarNoConst = typename remove_const<Scalar>::type;
using TensorBlock =
TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
using TensorBlockMapper =
TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
Evaluator evaluator(expr, device);
std::size_t total_size = array_prod(evaluator.dimensions());
std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
// TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
false>::run(expr, device);
return;
}
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Size tensor blocks to fit in cache (or requested target block size).
size_t block_total_size = numext::mini(cache_size, total_size);
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
// Query expression tree for desired block size/shape.
std::vector<TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
block_total_size);
block_total_size = block_mapper.block_dims_total_size();
Scalar* data = static_cast<Scalar*>(
device.allocate(block_total_size * sizeof(Scalar)));
const Index total_block_count = block_mapper.total_block_count();
for (Index i = 0; i < total_block_count; ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
evaluator.evalBlock(&block);
}
device.deallocate(data);
}
evaluator.cleanup();
}
};
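A minimal sketch, not part of the patch, of the cache budget the fallback test above compares against; firstLevelCacheSize() is the same call the executor makes on the device.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::DefaultDevice device;
  // Tensors smaller than this budget (after dividing by sizeof(Scalar))
  // skip the block machinery entirely.
  std::cout << "firstLevelCacheSize: " << device.firstLevelCacheSize()
            << " bytes" << std::endl;
  return 0;
}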
/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
*/
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
@@ -100,7 +178,7 @@ struct EvalRange {
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
@@ -110,8 +188,8 @@ struct EvalRange<Evaluator, Index, true> {
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
Index last_chunk_offset = last - 4 * PacketSize;
// Give the compiler a strong hint to unroll the loop. But don't insist
// on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4*PacketSize) {
for (Index j = 0; j < 4; j++) {
@@ -138,55 +216,113 @@ struct EvalRange<Evaluator, Index, true> {
}
};
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr,
const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const Index PacketSize =
Vectorizable
? unpacket_traits<typename Evaluator::PacketReturnType>::size
: 1;
const Index size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
[&evaluator](Index first, Index last) {
EvalRange::run(&evaluator, first, last);
});
}
evaluator.cleanup();
}
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr,
const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef typename internal::remove_const<
typename traits<Expression>::Scalar>::type Scalar;
typedef typename traits<Expression>::Index Index;
static const int NumDims = traits<Expression>::NumDimensions;
typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
TensorBlockMapper;
Evaluator evaluator(expr, device);
std::size_t total_size = array_prod(evaluator.dimensions());
std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
false>::run(expr, device);
evaluator.cleanup();
return;
}
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
size_t block_total_size = 0;
// Query expression tree for desired block size/shape.
std::vector<internal::TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
int num_threads = device.numThreads();
// Estimate minimum block size based on cost.
TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
size_t block_size = static_cast<size_t>(1.0 / taskSize);
TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
block_size);
block_size = block_mapper.block_dims_total_size();
const size_t aligned_blocksize =
EIGEN_MAX_ALIGN_BYTES *
divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
device.parallelFor(
block_mapper.total_block_count(), cost * block_size,
[=, &device, &evaluator, &block_mapper](Index first, Index last) {
// currentThreadId() returns -1 if called from a thread not in the
// threadpool, such as the main thread dispatching Eigen
// expressions.
const int thread_idx = device.currentThreadId();
eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
Scalar* thread_buf = reinterpret_cast<Scalar*>(
static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
for (Index i = first; i < last; ++i) {
auto block = block_mapper.GetBlockForIndex(i, thread_buf);
evaluator.evalBlock(&block);
}
});
device.deallocate(buf);
}
evaluator.cleanup();
}
};
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public:
typedef typename Expression::Index Index;
static void run(const Expression& expr, const GpuDevice& device);
@@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) {
}
/*static*/
template <typename Expression, bool Vectorizable, bool Tileable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
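A minimal sketch, not part of the patch, showing how the ThreadPoolDevice specializations above are reached through the public API; nothing here is specific to this commit beyond the fact that a block-capable expression may now be evaluated tile by tile across the pool.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
  a.setRandom();
  b.setRandom();

  // Dispatched to internal::TensorExecutor<..., ThreadPoolDevice,
  // Vectorizable, Tileable>::run.
  c.device(device) = a + b;
  return 0;
}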

View File

@@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true

View File

@@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = (PacketSize > 1),
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};

View File

@@ -129,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
template <typename Device, typename Expression>
struct IsTileable {
static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
};
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
bool Tileable = IsTileable<Device, Expression>::value>
class TensorExecutor;
} // end namespace internal
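A minimal sketch, not part of the patch, of how the new default template argument resolves. For a plain sum of two tensors, both leaf evaluators and the binary evaluator report BlockAccess after this commit, so IsTileable should come out true on the default device; this is an assumption for illustration, since it depends on the evaluator traits defined elsewhere in this commit.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(8, 8), b(8, 8);
  using Expr = decltype(a + b);

  // Assumed true for this expression; TensorExecutor then picks the
  // tiled (Tileable = true) specialization by default.
  const bool tileable =
      Eigen::internal::IsTileable<Eigen::DefaultDevice, Expr>::value;
  return tileable ? 0 : 1;
}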

View File

@@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false

View File

@@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -181,6 +182,7 @@ template<typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};

View File

@@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
// slice offsets and sizes.
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
@@ -557,6 +560,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
@@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
static const int NumDims = internal::array_size<Strides>::value;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
@@ -858,7 +861,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
return inputIndex;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
@@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;

View File

@@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false

View File

@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false

View File

@@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false

View File

@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
     Layout = PlainObjectType::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
     Layout = TensorRef<Derived>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
     RawAccess = false
   };

View File

@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

View File

@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
     RawAccess = false
   };

View File

@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false
@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = false

View File

@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
@ -110,7 +111,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
     for (int i = 0; i < NumInputDims; ++i) {
       m_reduced[i] = false;
     }
     const Dims& op_dims = op.dims();
     for (int i = 0; i < NumReducedDims; ++i) {
       eigen_assert(op_dims[i] >= 0);
@ -128,7 +129,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
     eigen_assert(num_distinct_reduce_dims == NumReducedDims);
     // Compute the dimensions of the result.
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     int output_index = 0;
@ -229,7 +230,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
       result += m_impl.coeff(cur_index);
       cur_index += index_stride;
     }
     return result;
   }

View File

@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_striding)
   ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_ref)
   ei_add_test(cxx11_tensor_random)
   ei_add_test(cxx11_tensor_generator)

View File

@ -901,7 +901,7 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
   CALL_SUBTEST(NAME<ColMajor>(ARG)); \
   CALL_SUBTEST(NAME<RowMajor>(ARG))
-EIGEN_DECLARE_TEST(cxx11_tensor_assign) {
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
   CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity);
   CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element);
   CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element);

View File

@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() {
 }
-void test_cxx11_tensor_complex_cwise_ops()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
 {
   CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
   CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());

View File

@ -177,7 +177,7 @@ static void test_cuda_product_reductions() {
 }
-void test_cxx11_tensor_complex()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
 {
   CALL_SUBTEST(test_cuda_nullary());
   CALL_SUBTEST(test_cuda_sum_reductions());

View File

@ -0,0 +1,81 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_USE_THREADS
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Index;
using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all the ops that support tiled execution.
template <typename Device, bool Vectorizable, bool Tileable, int Layout>
static void test_execute_binary_expr(Device d) {
// Pick a large enough tensor size to bypass small tensor block evaluation
// optimization.
Tensor<float, 3> lhs(840, 390, 37);
Tensor<float, 3> rhs(840, 390, 37);
Tensor<float, 3> dst(840, 390, 37);
lhs.setRandom();
rhs.setRandom();
const auto expr = lhs + rhs;
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
Executor::run(Assign(dst, expr), d);
for (int i = 0; i < 840; ++i) {
for (int j = 0; j < 390; ++j) {
for (int k = 0; k < 37; ++k) {
float sum = lhs(i, j, k) + rhs(i, j, k);
VERIFY_IS_EQUAL(sum, dst(i, j, k));
}
}
}
}
#define CALL_SUBTEST_COMBINATIONS(NAME) \
CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
const auto num_threads = internal::random<int>(1, 24);
Eigen::ThreadPool tp(num_threads);
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
}
#undef CALL_SUBTEST_COMBINATIONS
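To exercise the new tiled code path outside of the test harness, the following standalone sketch mirrors test_execute_binary_expr above on the default device. It is an illustration under the assumption that this patch is applied, not an additional test from the commit:

// Standalone usage sketch (assumes this patch is applied); it mirrors
// test_execute_binary_expr above, but runs once on the default device.
#include <Eigen/CXX11/Tensor>

int main() {
  using namespace Eigen;

  Tensor<float, 3> lhs(128, 64, 32);
  Tensor<float, 3> rhs(128, 64, 32);
  Tensor<float, 3> dst(128, 64, 32);
  lhs.setRandom();
  rhs.setRandom();

  const auto expr = lhs + rhs;
  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;

  // Tileable = true selects the new block-based evaluation strategy;
  // Vectorizable = true additionally enables packet access inside each block.
  using Executor =
      internal::TensorExecutor<const Assign, DefaultDevice,
                               /*Vectorizable=*/true, /*Tileable=*/true>;

  DefaultDevice device;
  Executor::run(Assign(dst, expr), device);
  return 0;
}

With Tileable = false the same code compiles and goes through the pre-existing evaluation loop; the test above verifies that every such combination produces identical results.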