refactor AssignmentEvaluator

This commit is contained in:
Charles Schlosser 2025-02-15 00:39:41 +00:00 committed by Rasmus Munk Larsen
parent 9c211430b5
commit eb3f9f443d

View File

@ -29,123 +29,109 @@ namespace internal {
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1> template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
struct copy_using_evaluator_traits { struct copy_using_evaluator_traits {
typedef typename DstEvaluator::XprType Dst; using Src = typename SrcEvaluator::XprType;
typedef typename Dst::Scalar DstScalar; using Dst = typename DstEvaluator::XprType;
using DstScalar = typename Dst::Scalar;
enum { DstFlags = DstEvaluator::Flags, SrcFlags = SrcEvaluator::Flags }; static constexpr int DstFlags = DstEvaluator::Flags;
static constexpr int SrcFlags = SrcEvaluator::Flags;
public: public:
enum { static constexpr int DstAlignment = DstEvaluator::Alignment;
DstAlignment = DstEvaluator::Alignment, static constexpr int SrcAlignment = SrcEvaluator::Alignment;
SrcAlignment = SrcEvaluator::Alignment, static constexpr int JointAlignment = plain_enum_min(DstAlignment, SrcAlignment);
DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, static constexpr bool DstHasDirectAccess = bool(DstFlags & DirectAccessBit);
JointAlignment = plain_enum_min(DstAlignment, SrcAlignment) static constexpr bool SrcIsRowMajor = bool(SrcFlags & RowMajorBit);
}; static constexpr bool DstIsRowMajor = bool(DstFlags & RowMajorBit);
static constexpr bool IsVectorAtCompileTime = Dst::IsVectorAtCompileTime;
private: static constexpr int RowsAtCompileTime = Dst::RowsAtCompileTime;
enum { static constexpr int ColsAtCompileTime = Dst::ColsAtCompileTime;
InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime) static constexpr int SizeAtCompileTime = Dst::SizeAtCompileTime;
: int(DstFlags) & RowMajorBit ? int(Dst::ColsAtCompileTime) static constexpr int MaxRowsAtCompileTime = Dst::MaxRowsAtCompileTime;
: int(Dst::RowsAtCompileTime), static constexpr int MaxColsAtCompileTime = Dst::MaxColsAtCompileTime;
InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) static constexpr int MaxSizeAtCompileTime = Dst::MaxSizeAtCompileTime;
: int(DstFlags) & RowMajorBit ? int(Dst::MaxColsAtCompileTime) static constexpr int InnerSizeAtCompileTime = IsVectorAtCompileTime ? SizeAtCompileTime
: int(Dst::MaxRowsAtCompileTime), : DstIsRowMajor ? ColsAtCompileTime
RestrictedInnerSize = min_size_prefer_fixed(InnerSize, MaxPacketSize), : RowsAtCompileTime;
RestrictedLinearSize = min_size_prefer_fixed(Dst::SizeAtCompileTime, MaxPacketSize), static constexpr int MaxInnerSizeAtCompileTime = IsVectorAtCompileTime ? MaxSizeAtCompileTime
OuterStride = int(outer_stride_at_compile_time<Dst>::ret), : DstIsRowMajor ? MaxColsAtCompileTime
MaxSizeAtCompileTime = Dst::SizeAtCompileTime : MaxRowsAtCompileTime;
}; static constexpr int RestrictedInnerSize = min_size_prefer_fixed(InnerSizeAtCompileTime, MaxPacketSize);
static constexpr int RestrictedLinearSize = min_size_prefer_fixed(SizeAtCompileTime, MaxPacketSize);
static constexpr int OuterStride = outer_stride_at_compile_time<Dst>::ret;
// TODO distinguish between linear traversal and inner-traversals // TODO distinguish between linear traversal and inner-traversals
typedef typename find_best_packet<DstScalar, RestrictedLinearSize>::type LinearPacketType; using LinearPacketType = typename find_best_packet<DstScalar, RestrictedLinearSize>::type;
typedef typename find_best_packet<DstScalar, RestrictedInnerSize>::type InnerPacketType; using InnerPacketType = typename find_best_packet<DstScalar, RestrictedInnerSize>::type;
enum { static constexpr int LinearPacketSize = unpacket_traits<LinearPacketType>::size;
LinearPacketSize = unpacket_traits<LinearPacketType>::size, static constexpr int InnerPacketSize = unpacket_traits<InnerPacketType>::size;
InnerPacketSize = unpacket_traits<InnerPacketType>::size
};
public: public:
enum { static constexpr int LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment;
LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment, static constexpr int InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment;
InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
};
private: private:
enum { static constexpr bool StorageOrdersAgree = DstIsRowMajor == SrcIsRowMajor;
DstIsRowMajor = DstFlags & RowMajorBit, static constexpr bool MightVectorize = StorageOrdersAgree && bool(DstFlags & SrcFlags & ActualPacketAccessBit) &&
SrcIsRowMajor = SrcFlags & RowMajorBit, bool(functor_traits<AssignFunc>::PacketAccess);
StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)), static constexpr bool MayInnerVectorize = MightVectorize && (InnerSizeAtCompileTime != Dynamic) &&
MightVectorize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) && (InnerSizeAtCompileTime % InnerPacketSize == 0) &&
bool(functor_traits<AssignFunc>::PacketAccess), (OuterStride != Dynamic) && (OuterStride % InnerPacketSize == 0) &&
MayInnerVectorize = MightVectorize && int(InnerSize) != Dynamic && int(InnerSize) % int(InnerPacketSize) == 0 && (EIGEN_UNALIGNED_VECTORIZE || JointAlignment >= InnerRequiredAlignment),
int(OuterStride) != Dynamic && int(OuterStride) % int(InnerPacketSize) == 0 && MayLinearize = StorageOrdersAgree && (DstFlags & SrcFlags & LinearAccessBit),
(EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment) >= int(InnerRequiredAlignment)), MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess &&
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment) ||
MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) && MaxSizeAtCompileTime == Dynamic);
(EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment) >= int(LinearRequiredAlignment)) || /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
MaxSizeAtCompileTime == Dynamic), so it's only good for large enough sizes. */
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll, static constexpr bool MaySliceVectorize =
so it's only good for large enough sizes. */ MightVectorize && DstHasDirectAccess &&
MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) && (MaxInnerSizeAtCompileTime == Dynamic ||
(int(InnerMaxSize) == Dynamic || MaxInnerSizeAtCompileTime >= (EIGEN_UNALIGNED_VECTORIZE ? InnerPacketSize : (3 * InnerPacketSize)));
int(InnerMaxSize) >= (EIGEN_UNALIGNED_VECTORIZE ? InnerPacketSize : (3 * InnerPacketSize))) /* slice vectorization can be slow, so we only want it if the slices are big, which is
/* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block in a fixed-size matrix
in a fixed-size matrix However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
};
public: public:
enum { static constexpr int Traversal = SizeAtCompileTime == 0 ? AllAtOnceTraversal
Traversal = int(Dst::SizeAtCompileTime) == 0 : (MayLinearVectorize && (LinearPacketSize > InnerPacketSize))
? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time. ? LinearVectorizedTraversal
: (int(MayLinearVectorize) && (LinearPacketSize > InnerPacketSize)) ? int(LinearVectorizedTraversal) : MayInnerVectorize ? InnerVectorizedTraversal
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : MayLinearVectorize ? LinearVectorizedTraversal
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : MaySliceVectorize ? SliceVectorizedTraversal
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal) : MayLinearize ? LinearTraversal
: int(MayLinearize) ? int(LinearTraversal) : DefaultTraversal;
: int(DefaultTraversal), static constexpr bool Vectorized = Traversal == InnerVectorizedTraversal || Traversal == LinearVectorizedTraversal ||
Vectorized = int(Traversal) == InnerVectorizedTraversal || int(Traversal) == LinearVectorizedTraversal || Traversal == SliceVectorizedTraversal;
int(Traversal) == SliceVectorizedTraversal
};
typedef std::conditional_t<int(Traversal) == LinearVectorizedTraversal, LinearPacketType, InnerPacketType> PacketType; using PacketType = std::conditional_t<Traversal == LinearVectorizedTraversal, LinearPacketType, InnerPacketType>;
private: private:
enum { static constexpr int ActualPacketSize = Vectorized ? unpacket_traits<PacketType>::size : 1;
ActualPacketSize = int(Traversal) == LinearVectorizedTraversal ? LinearPacketSize static constexpr int UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize;
: Vectorized ? InnerPacketSize static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost);
: 1, static constexpr bool MayUnrollCompletely =
UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize, (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
MayUnrollCompletely = static constexpr bool MayUnrollInner =
int(Dst::SizeAtCompileTime) != Dynamic && (InnerSizeAtCompileTime != Dynamic) && (InnerSizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost)) <=
int(UnrollingLimit),
MayUnrollInner =
int(InnerSize) != Dynamic &&
int(InnerSize) * (int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
};
public: public:
enum { static constexpr int Unrolling =
Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal)) (Traversal == InnerVectorizedTraversal || Traversal == DefaultTraversal)
? (int(MayUnrollCompletely) ? int(CompleteUnrolling) ? (MayUnrollCompletely ? CompleteUnrolling
: int(MayUnrollInner) ? int(InnerUnrolling) : MayUnrollInner ? InnerUnrolling
: int(NoUnrolling)) : NoUnrolling)
: int(Traversal) == int(LinearVectorizedTraversal) : Traversal == LinearVectorizedTraversal
? (bool(MayUnrollCompletely) && ? (MayUnrollCompletely && (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment))
(EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment) >= int(LinearRequiredAlignment))) ? CompleteUnrolling
? int(CompleteUnrolling) : NoUnrolling)
: int(NoUnrolling)) : Traversal == LinearTraversal ? (MayUnrollCompletely ? CompleteUnrolling : NoUnrolling)
: int(Traversal) == int(LinearTraversal)
? (bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling))
#if EIGEN_UNALIGNED_VECTORIZE #if EIGEN_UNALIGNED_VECTORIZE
: int(Traversal) == int(SliceVectorizedTraversal) : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
? (bool(MayUnrollInner) ? int(InnerUnrolling) : int(NoUnrolling))
#endif #endif
: int(NoUnrolling) : NoUnrolling;
};
#ifdef EIGEN_DEBUG_ASSIGN #ifdef EIGEN_DEBUG_ASSIGN
static void debug() { static void debug() {
@ -162,8 +148,8 @@ struct copy_using_evaluator_traits {
EIGEN_DEBUG_VAR(LinearRequiredAlignment) EIGEN_DEBUG_VAR(LinearRequiredAlignment)
EIGEN_DEBUG_VAR(InnerRequiredAlignment) EIGEN_DEBUG_VAR(InnerRequiredAlignment)
EIGEN_DEBUG_VAR(JointAlignment) EIGEN_DEBUG_VAR(JointAlignment)
EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerSizeAtCompileTime)
EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(MaxInnerSizeAtCompileTime)
EIGEN_DEBUG_VAR(LinearPacketSize) EIGEN_DEBUG_VAR(LinearPacketSize)
EIGEN_DEBUG_VAR(InnerPacketSize) EIGEN_DEBUG_VAR(InnerPacketSize)
EIGEN_DEBUG_VAR(ActualPacketSize) EIGEN_DEBUG_VAR(ActualPacketSize)
@ -196,17 +182,14 @@ struct copy_using_evaluator_traits {
*** Default traversal *** *** Default traversal ***
************************/ ************************/
template <typename Kernel, int Index, int Stop> template <typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling { struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling {
// FIXME: this is not very clean, perhaps this information should be provided by the kernel? static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;
typedef typename Kernel::DstEvaluatorType DstEvaluatorType; static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;
typedef typename DstEvaluatorType::XprType DstXprType;
enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime };
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
kernel.assignCoeffByOuterInner(outer, inner); kernel.assignCoeffByOuterInner(Outer, Inner);
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index + 1, Stop>::run(kernel); copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
} }
}; };
@ -232,11 +215,11 @@ struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
*** Linear traversal *** *** Linear traversal ***
***********************/ ***********************/
template <typename Kernel, int Index, int Stop> template <typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling { struct copy_using_evaluator_LinearTraversal_CompleteUnrolling {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
kernel.assignCoeff(Index); kernel.assignCoeff(Index_);
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index + 1, Stop>::run(kernel); copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
} }
}; };
@ -249,23 +232,17 @@ struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop
*** Inner vectorization *** *** Inner vectorization ***
**************************/ **************************/
template <typename Kernel, int Index, int Stop> template <typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling { struct copy_using_evaluator_innervec_CompleteUnrolling {
// FIXME: this is not very clean, perhaps this information should be provided by the kernel? using PacketType = typename Kernel::PacketType;
typedef typename Kernel::DstEvaluatorType DstEvaluatorType; static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;
typedef typename DstEvaluatorType::XprType DstXprType; static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;
typedef typename Kernel::PacketType PacketType; static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
enum { static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime,
SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
DstAlignment = Kernel::AssignmentTraits::DstAlignment
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner); kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(Outer, Inner);
enum { NextIndex = Index + unpacket_traits<PacketType>::size };
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel); copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
} }
}; };
@ -277,10 +254,11 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> {
template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment> template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
struct copy_using_evaluator_innervec_InnerUnrolling { struct copy_using_evaluator_innervec_InnerUnrolling {
typedef typename Kernel::PacketType PacketType; using PacketType = typename Kernel::PacketType;
static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_); kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel,
outer); outer);
} }
@ -308,9 +286,10 @@ struct dense_assignment_loop;
// Zero-sized assignment is a no-op. // Zero-sized assignment is a no-op.
template <typename Kernel, int Unrolling> template <typename Kernel, int Unrolling>
struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling> { struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling> {
static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) { EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) {
EIGEN_STATIC_ASSERT(int(Kernel::DstEvaluatorType::XprType::SizeAtCompileTime) == 0, EIGEN_STATIC_ASSERT(SizeAtCompileTime == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
} }
}; };
@ -331,21 +310,21 @@ struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling> {
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling> { struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling> {
static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
typedef typename Kernel::DstEvaluatorType::XprType DstXprType; copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
} }
}; };
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling> { struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { static constexpr int InnerSizeAtCompileTime = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
const Index outerSize = kernel.outerSize(); const Index outerSize = kernel.outerSize();
for (Index outer = 0; outer < outerSize; ++outer) for (Index outer = 0; outer < outerSize; ++outer)
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSizeAtCompileTime>::run(kernel, outer);
outer);
} }
}; };
@ -380,18 +359,15 @@ struct unaligned_dense_assignment_loop<false> {
} }
}; };
template <typename Kernel, int Index, int Stop> template <typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_linearvec_CompleteUnrolling { struct copy_using_evaluator_linearvec_CompleteUnrolling {
// FIXME: this is not very clean, perhaps this information should be provided by the kernel? using PacketType = typename Kernel::PacketType;
typedef typename Kernel::DstEvaluatorType DstEvaluatorType; static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
typedef typename DstEvaluatorType::XprType DstXprType; static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
typedef typename Kernel::PacketType PacketType; static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
enum { SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, DstAlignment = Kernel::AssignmentTraits::DstAlignment };
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(Index); kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(Index_);
enum { NextIndex = Index + unpacket_traits<PacketType>::size };
copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel); copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
} }
}; };
@ -403,26 +379,24 @@ struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> { struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> {
using Scalar = typename Kernel::Scalar;
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment;
static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
static constexpr int DstAlignment =
packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
const Index size = kernel.size(); const Index size = kernel.size();
typedef typename Kernel::Scalar Scalar; const Index alignedStart = DstIsAligned ? 0 : first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
typedef typename Kernel::PacketType PacketType; const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
enum {
requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
packetSize = unpacket_traits<PacketType>::size,
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment) >= int(requestedAlignment),
dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
: int(Kernel::AssignmentTraits::DstAlignment),
srcAlignment = Kernel::AssignmentTraits::JointAlignment
};
const Index alignedStart =
dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + ((size - alignedStart) / packetSize) * packetSize;
unaligned_dense_assignment_loop<dstIsAligned != 0>::run(kernel, 0, alignedStart); unaligned_dense_assignment_loop<DstIsAligned>::run(kernel, 0, alignedStart);
for (Index index = alignedStart; index < alignedEnd; index += packetSize) for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index); kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
} }
@ -430,18 +404,14 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> {
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling> { struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling> {
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
static constexpr int AlignedSize = numext::round_down(Size, PacketSize);
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
typedef typename Kernel::DstEvaluatorType::XprType DstXprType; copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);
typedef typename Kernel::PacketType PacketType; copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, AlignedSize, Size>::run(kernel);
enum {
size = DstXprType::SizeAtCompileTime,
packetSize = unpacket_traits<PacketType>::size,
alignedSize = (int(size) / packetSize) * packetSize
};
copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
} }
}; };
@ -451,35 +421,40 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> { struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> {
typedef typename Kernel::PacketType PacketType; using PacketType = typename Kernel::PacketType;
enum { SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, DstAlignment = Kernel::AssignmentTraits::DstAlignment }; static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
const Index innerSize = kernel.innerSize(); const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize(); const Index outerSize = kernel.outerSize();
const Index packetSize = unpacket_traits<PacketType>::size;
for (Index outer = 0; outer < outerSize; ++outer) for (Index outer = 0; outer < outerSize; ++outer)
for (Index inner = 0; inner < innerSize; inner += packetSize) for (Index inner = 0; inner < innerSize; inner += PacketSize)
kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner); kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
} }
}; };
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling> { struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
typedef typename Kernel::DstEvaluatorType::XprType DstXprType; copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
} }
}; };
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> { struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::AssignmentTraits Traits;
const Index outerSize = kernel.outerSize(); const Index outerSize = kernel.outerSize();
for (Index outer = 0; outer < outerSize; ++outer) for (Index outer = 0; outer < outerSize; ++outer)
copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime, Traits::SrcAlignment, copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(kernel,
Traits::DstAlignment>::run(kernel, outer); outer);
} }
}; };
@ -498,8 +473,8 @@ struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling> {
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> { struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
typedef typename Kernel::DstEvaluatorType::XprType DstXprType; copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run(
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel); kernel);
} }
}; };
@ -509,42 +484,40 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> {
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> { struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> {
using Scalar = typename Kernel::Scalar;
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment;
static constexpr bool Alignable =
packet_traits<Scalar>::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar);
static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
typedef typename Kernel::Scalar Scalar;
typedef typename Kernel::PacketType PacketType;
enum {
packetSize = unpacket_traits<PacketType>::size,
requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
alignable =
packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment) >= sizeof(Scalar),
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment) >= int(requestedAlignment),
dstAlignment = alignable ? int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment)
};
const Scalar* dst_ptr = kernel.dstDataPtr(); const Scalar* dst_ptr = kernel.dstDataPtr();
if ((!bool(dstIsAligned)) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) { if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) {
// the pointer is not aligned-on scalar, so alignment is not possible // the pointer is not aligned-on scalar, so alignment is not possible
return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel); return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel);
} }
const Index packetAlignedMask = packetSize - 1;
const Index innerSize = kernel.innerSize(); const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize(); const Index outerSize = kernel.outerSize();
const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0; const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;
Index alignedStart = Index alignedStart =
((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize); ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<RequestedAlignment>(dst_ptr, innerSize);
for (Index outer = 0; outer < outerSize; ++outer) { for (Index outer = 0; outer < outerSize; ++outer) {
const Index alignedEnd = alignedStart + ((innerSize - alignedStart) & ~packetAlignedMask); const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);
// do the non-vectorizable part of the assignment // do the non-vectorizable part of the assignment
for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner); for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
// do the vectorizable part of the assignment // do the vectorizable part of the assignment
for (Index inner = alignedStart; inner < alignedEnd; inner += packetSize) for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner); kernel.template assignPacketByOuterInner<DstAlignment, Unaligned, PacketType>(outer, inner);
// do the non-vectorizable part of the assignment // do the non-vectorizable part of the assignment
for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner); for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
alignedStart = numext::mini((alignedStart + alignedStep) % packetSize, innerSize); alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);
} }
} }
}; };
@ -552,20 +525,15 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> {
#if EIGEN_UNALIGNED_VECTORIZE #if EIGEN_UNALIGNED_VECTORIZE
template <typename Kernel> template <typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> { struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> {
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::PacketType PacketType;
enum {
innerSize = DstXprType::InnerSizeAtCompileTime,
packetSize = unpacket_traits<PacketType>::size,
vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize),
size = DstXprType::SizeAtCompileTime
};
for (Index outer = 0; outer < kernel.outerSize(); ++outer) { for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer); copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, 0, 0>::run(kernel, outer);
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer); copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizableSize, InnerSize>::run(kernel, outer);
} }
} }
}; };