Mirror of https://gitlab.com/libeigen/eigen.git
Add block access to TensorReverseOp and make sure that TensorForcedEval uses block access when preferred
commit 878845cb25
parent 16a56b2ddd
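For orientation, the code paths touched below can be exercised with an ordinary tensor expression that reverses a dimension and forces an intermediate evaluation on a thread-pool device. The following is a minimal sketch, not part of the commit; tensor sizes, thread count, and variable names are arbitrary.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 3> input(64, 32, 16);
  input.setRandom();

  // reverse() builds a TensorReverseOp; .eval() wraps it in a
  // TensorForcedEvalOp, which materializes the reversed data into a
  // temporary buffer before the final assignment runs on the device.
  Eigen::array<bool, 3> rev{{true, false, false}};
  Eigen::Tensor<float, 3> output(64, 32, 16);
  output.device(device) = input.reverse(rev).eval() * 2.0f;

  return 0;
}

With this commit, the forced evaluation in the last assignment dispatches to the tiled executor whenever the wrapped expression reports both BlockAccess and PreferBlockAccess, which TensorReverseOp now does.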
@@ -77,6 +77,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
 
+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}
 
@@ -107,13 +109,20 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    BlockAccess = true,
     PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
     RawAccess = true
   };
 
+  typedef typename internal::TensorBlock<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlock;
+  typedef typename internal::TensorBlockReader<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlockReader;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device),
         m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
@@ -143,6 +152,18 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+    TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
+                              block->tensor_strides(), block->tensor_strides(),
+                              m_buffer + block->first_coeff_index());
+    m_impl.block(&eval_to_block);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -158,6 +179,11 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+    assert(m_buffer != NULL);
+    TensorBlockReader::Run(block, m_buffer);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     // We assume that evalPacket or evalScalar is called to perform the
     // assignment and account for the cost of the write here.
@@ -346,7 +346,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
       // expressions.
       const int thread_idx = device.currentThreadId();
       eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
-      Scalar* thread_buf = reinterpret_cast<Scalar*>(
+      ScalarNoConst* thread_buf = reinterpret_cast<ScalarNoConst*>(
          static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
       for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
         auto block = block_mapper.GetBlockForIndex(i, thread_buf);
@@ -126,8 +126,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
     }
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
     EvalTo evalToTmp(m_buffer, m_op);
+
     const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
-    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
+    const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
+                      TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
+
+    internal::TensorExecutor<const EvalTo,
+                             typename internal::remove_const<Device>::type,
+                             Vectorize, Tile>::run(evalToTmp, m_device);
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -113,16 +113,23 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = false,
+    BlockAccess = true,
+    PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
-    RawAccess = false
+    RawAccess = false,
+
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+      OutputTensorBlock;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                         const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -140,6 +147,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
       }
     }
+    // Remember the strides for fast division.
+    for (int i = 0; i < NumDims; ++i) {
+      m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
|||||||
Index inputIndex = 0;
|
Index inputIndex = 0;
|
||||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
for (int i = NumDims - 1; i > 0; --i) {
|
for (int i = NumDims - 1; i > 0; --i) {
|
||||||
Index idx = index / m_strides[i];
|
Index idx = index / m_fastStrides[i];
|
||||||
index -= idx * m_strides[i];
|
index -= idx * m_strides[i];
|
||||||
if (m_reverse[i]) {
|
if (m_reverse[i]) {
|
||||||
idx = m_dimensions[i] - idx - 1;
|
idx = m_dimensions[i] - idx - 1;
|
||||||
@@ -173,7 +184,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
       }
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -212,6 +223,131 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+  }
+
+  struct BlockIteratorState {
+    Index block_size;
+    Index block_stride;
+    Index block_span;
+    Index input_size;
+    Index input_stride;
+    Index input_span;
+    Index count;
+    bool reverse;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    if (NumDims <= 0) return;
+
+    // TODO(ezhulenev): If underlying tensor expression supports and prefers
+    // block evaluation we must use it. Currently we use coeff and packet
+    // access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    CoeffReturnType* data = output_block->data();
+    Index block_offset = 0;
+
+    Index input_offset = reverseIndex(output_block->first_coeff_index());
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (Index i = 0; i < NumDims; ++i) {
+      const Index dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].block_size = output_block->block_sizes()[dim];
+      it[i].block_stride = output_block->block_strides()[dim];
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_size = m_dimensions[dim];
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
+      it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].block_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+      // worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].block_size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
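The merge step in block() above is easier to see on a toy example. The sketch below is only an illustration, separate from the diff, and simplifies away the block-stride checks; the dimension sizes and reverse flags are made up. It shows how adjacent dimensions that share the inner dimension's reverse flag collapse into one longer contiguous span.

#include <array>
#include <cstdio>

int main() {
  // Dimension sizes in inner-most -> outer-most order, plus per-dimension
  // reverse flags (arbitrary values for illustration).
  const std::array<int, 4> dims = {8, 4, 2, 16};
  const std::array<bool, 4> reverse = {true, true, false, true};

  // Merge adjacent dimensions that share the inner dimension's reverse flag
  // into a single "virtual" inner dimension.
  int effective_inner_dim = 0;
  long inner_span = dims[0];
  for (int i = 1; i < 4; ++i) {
    if (reverse[i] != reverse[0]) break;  // flag mismatch stops the merge
    inner_span *= dims[i];
    effective_inner_dim = i;
  }

  // Here dims 0 and 1 merge: the copy loop can move 8 * 4 = 32 contiguous
  // coefficients at a time instead of 8.
  std::printf("merged through dim %d, inner span = %ld\n",
              effective_inner_dim, inner_span);
  return 0;
}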
@@ -235,8 +371,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device& m_device;
 };
 
 // Eval as lvalue
@@ -270,6 +270,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
             input_block_strides[i + 1] * input_block_sizes[i + 1];
       }
     }
+    DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
+    for (int i = 0; i < NumDims; ++i) {
+      fast_input_block_strides[i] =
+          internal::TensorIntDivisor<Index>(input_block_strides[i]);
+    }
 
     // Read input block.
     TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
@@ -293,8 +298,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         continue;
       }
 
-      Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
-                                               output_block_strides);
+      Index output_index =
+          GetBlockOutputIndex(input_index, input_block_strides,
+                              output_block_strides, fast_input_block_strides);
       if (output_index == input_index) {
         // Coefficient already in place.
         bitmap[output_index] = true;
@@ -312,8 +318,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
           data[output_index] = shuffled_value;
           shuffled_value = evicted_value;
           bitmap[output_index] = true;
-          output_index = GetBlockOutputIndex(output_index, input_block_strides,
-                                             output_block_strides);
+          output_index =
+              GetBlockOutputIndex(output_index, input_block_strides,
+                                  output_block_strides, fast_input_block_strides);
         } while (output_index != input_index);
 
         data[output_index] = shuffled_value;
@@ -341,11 +348,12 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
       Index input_index,
       const DSizes<Index, NumDims>& input_block_strides,
-      const DSizes<Index, NumDims>& output_block_strides) const {
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
     Index output_index = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -353,7 +361,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
              output_block_strides[m_inverseShuffle[0]];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -527,6 +527,41 @@ static void test_execute_generator_op(Device d)
   }
 }
 
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          bool Tileable, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Reverse half of the dimensions.
+  Eigen::array<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = (dims[i] % 2 == 0);
+
+  const auto expr = src.reverse(reverse);
+
+  // We assume that reversing on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the reversing using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
 #define CALL_SUBTEST_PART(PART) \
   CALL_SUBTEST_##PART
 
@@ -613,8 +648,14 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
 
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
   // Force CMake to split this test.
-  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14
 }
 
 #undef CALL_SUBTEST_COMBINATIONS