masked load/store framework

Charles Schlosser 2025-04-12 00:31:10 +00:00
parent cebe09110c
commit 28c3b26d53
17 changed files with 1363 additions and 177 deletions
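In short, the commit threads a (begin, count) pair through the dense evaluators (packetSegment / writePacketSegment / assignPacketSegment) and adds matching packet-math primitives (ploaduSegment, pstoreuSegment, ploadtSegment, pstoretSegment), so that a remainder of fewer than PacketSize elements can be handled by a single partial packet operation instead of a scalar cleanup loop. A minimal usage sketch of the new low-level primitives, assuming an SSE build where Packet4f is available (illustrative only, not part of the diff below):

#include <Eigen/Core>

// Hypothetical helper: copy a 3-element tail with one partial load/store pair
// instead of a scalar loop (a sketch of the intended use of the new primitives).
void copy_tail3(const float* src, float* dst) {
  using namespace Eigen::internal;
  using Packet = Packet4f;  // 4 floats per packet on SSE (assumption for this sketch)
  Packet p = ploaduSegment<Packet>(src, /*begin=*/0, /*count=*/3);  // src[3] is not accessed
  pstoreuSegment<float, Packet>(dst, p, /*begin=*/0, /*count=*/3);  // dst[3] is not written
}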

View File

@@ -136,6 +136,8 @@ struct copy_using_evaluator_traits {
: Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
#endif
: NoUnrolling;
static constexpr bool UsePacketSegment =
enable_packet_segment<Src>::value && enable_packet_segment<Dst>::value && has_packet_segment<PacketType>::value;
#ifdef EIGEN_DEBUG_ASSIGN
static void debug() {
@@ -273,6 +275,33 @@ struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlign
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
};
template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment>
struct copy_using_evaluator_innervec_segment {
using PacketType = typename Kernel::PacketType;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0,
Stop - Start);
}
};
template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment>
struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment,
/*UsePacketSegment*/ false>
: copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {};
template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
/*UsePacketSegment*/ true> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
};
template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
/*UsePacketSegment*/ false> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
};
/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/
@@ -353,28 +382,48 @@ struct dense_assignment_loop_impl<Kernel, DefaultTraversal, InnerUnrolling> {
// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
// of the non-vectorizable beginning and ending parts
template <bool IsAligned = false>
template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip>
struct unaligned_dense_assignment_loop {
// if IsAligned = true, then do nothing
// if Skip == true, then do nothing
template <typename Kernel>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& /*kernel*/, Index /*start*/,
Index /*end*/) {}
template <typename Kernel>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& /*kernel*/, Index /*outer*/,
Index /*innerStart*/, Index /*innerEnd*/) {}
};
template <>
struct unaligned_dense_assignment_loop<false> {
// MSVC must not inline these functions. If it does, it fails to optimize the
// packet access path.
// FIXME check which version exhibits this issue
#if EIGEN_COMP_MSVC
template <typename PacketType, int DstAlignment, int SrcAlignment>
struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true,
/*Skip*/ false> {
template <typename Kernel>
static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end)
#else
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) {
Index count = end - start;
eigen_assert(count <= unpacket_traits<PacketType>::size);
if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count);
}
template <typename Kernel>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end)
#endif
{
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index outer, Index start,
Index end) {
Index count = end - start;
eigen_assert(count <= unpacket_traits<PacketType>::size);
if (count > 0)
kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count);
}
};
template <typename PacketType, int DstAlignment, int SrcAlignment>
struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false,
/*Skip*/ false> {
template <typename Kernel>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) {
for (Index index = start; index < end; ++index) kernel.assignCoeff(index);
}
template <typename Kernel>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index outer, Index innerStart,
Index innerEnd) {
for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
}
};
template <typename Kernel, int Index_, int Stop>
@@ -395,28 +444,60 @@ struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
};
template <typename Kernel, int Index_, int Stop, bool UsePacketSegment>
struct copy_using_evaluator_linearvec_segment {
using PacketType = typename Kernel::PacketType;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_);
}
};
template <typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false>
: copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {};
template <typename Kernel, int Stop>
struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
};
template <typename Kernel, int Stop>
struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
};
template <typename Kernel>
struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, NoUnrolling> {
using Scalar = typename Kernel::Scalar;
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment;
static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
static constexpr int DstAlignment =
packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
static constexpr bool Alignable =
(DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
static constexpr bool DstIsAligned = DstAlignment >= Alignment;
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
using head_loop =
unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
const Index size = kernel.size();
const Index alignedStart = DstIsAligned ? 0 : first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
const Index alignedStart = DstIsAligned ? 0 : first_aligned<Alignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
unaligned_dense_assignment_loop<DstIsAligned>::run(kernel, 0, alignedStart);
head_loop::run(kernel, 0, alignedStart);
for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index);
unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
tail_loop::run(kernel, alignedEnd, size);
}
};
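// Worked example (illustrative note, not part of the diff): with PacketSize == 4, size == 11, and a
// destination pointer two scalars short of packet alignment, alignedStart == 2 and
// alignedEnd == 2 + numext::round_down(11 - 2, 4) == 10. head_loop then covers [0, 2), the vectorized
// body covers [2, 10), and tail_loop covers [10, 11); when UsePacketSegment is true, each boundary is
// written with a single partial packet store instead of scalar stores.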
@@ -426,10 +507,11 @@ struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, CompleteUnr
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
static constexpr int AlignedSize = numext::round_down(Size, PacketSize);
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, AlignedSize, Size>::run(kernel);
copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel);
}
};
@@ -505,35 +587,35 @@ struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, NoUnrolling>
using Scalar = typename Kernel::Scalar;
using PacketType = typename Kernel::PacketType;
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment;
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
static constexpr bool Alignable =
packet_traits<Scalar>::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar);
static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
(DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
static constexpr bool DstIsAligned = DstAlignment >= Alignment;
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>;
using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
const Scalar* dst_ptr = kernel.dstDataPtr();
if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) {
// the pointer is not aligned-on scalar, so alignment is not possible
return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel);
}
const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize();
const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;
Index alignedStart =
((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<RequestedAlignment>(dst_ptr, innerSize);
Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<Alignment>(dst_ptr, innerSize);
for (Index outer = 0; outer < outerSize; ++outer) {
const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);
// do the non-vectorizable part of the assignment
for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
head_loop::run(kernel, outer, 0, alignedStart);
// do the vectorizable part of the assignment
for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
kernel.template assignPacketByOuterInner<DstAlignment, Unaligned, PacketType>(outer, inner);
kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner);
// do the non-vectorizable part of the assignment
for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
tail_loop::run(kernel, outer, alignedEnd, innerSize);
alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);
}
@@ -547,11 +629,16 @@ struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, InnerUnrolli
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>;
using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned,
Unaligned, UsePacketSegment>;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, 0, 0>::run(kernel, outer);
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizableSize, InnerSize>::run(kernel, outer);
packet_loop::run(kernel, outer);
packet_segment_loop::run(kernel, outer);
}
}
};
@@ -635,6 +722,27 @@ class generic_dense_assignment_kernel {
assignPacket<StoreMode, LoadMode, Packet>(row, col);
}
template <int StoreMode, int LoadMode, typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
m_functor.template assignPacketSegment<StoreMode>(
&m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin,
count);
}
template <int StoreMode, int LoadMode, typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
m_functor.template assignPacketSegment<StoreMode>(
&m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count);
}
template <int StoreMode, int LoadMode, typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin,
Index count) {
Index row = rowIndexByOuterInner(outer, inner);
Index col = colIndexByOuterInner(outer, inner);
assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
typedef typename DstEvaluatorType::ExpressionTraits Traits;
return int(Traits::RowsAtCompileTime) == 1 ? 0

View File

@@ -198,19 +198,13 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
if (IsRowMajor)
return m_d.data[row * m_d.outerStride() + col];
else
return m_d.data[row + col * m_d.outerStride()];
return coeff(getIndex(row, col));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
if (IsRowMajor)
return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
else
return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
return coeffRef(getIndex(row, col));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
@@ -219,10 +213,7 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
if (IsRowMajor)
return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
else
return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
return packet<LoadMode, PacketType>(getIndex(row, col));
}
template <int LoadMode, typename PacketType>
@@ -232,19 +223,43 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
if (IsRowMajor)
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
else
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
writePacket<StoreMode, PacketType>(getIndex(row, col), x);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count);
}
protected:
plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndex(Index row, Index col) const {
return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
}
};
template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -318,6 +333,28 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpos
m_argImpl.template writePacket<StoreMode, PacketType>(index, x);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count);
}
protected:
evaluator<ArgType> m_argImpl;
};
@@ -464,10 +501,10 @@ template <typename NullaryOp, typename PlainObjectType>
struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
: evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
typedef internal::remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
enum {
CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
CoeffReadCost = functor_traits<NullaryOp>::Cost,
Flags = (evaluator<PlainObjectTypeCleaned>::Flags &
(HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0) |
@@ -502,9 +539,21 @@ struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
return m_wrapper.template packetOp<PacketType>(m_functor, index);
}
template <int LoadMode, typename PacketType, typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/,
Index /*count*/) const {
return packet<LoadMode, PacketType, IndexType>(row, col);
}
template <int LoadMode, typename PacketType, typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/,
Index /*count*/) const {
return packet<LoadMode, PacketType, IndexType>(index);
}
protected:
const NullaryOp m_functor;
const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
};
// -------------------- CwiseUnaryOp --------------------
@@ -546,6 +595,16 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_b
return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
}
protected:
// this helper permits to completely eliminate the functor if it is empty
struct Data {
@@ -600,16 +659,11 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
template <typename DstPacketType>
using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>;
template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) const {
return col + packetSize <= cols();
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const {
return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows());
}
template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const {
return row + packetSize <= rows();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const {
return index + packetSize <= size();
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const {
return index + count + begin <= size();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const {
@@ -632,43 +686,86 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
template <int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index actualRow = IsRowMajor ? row : row + (offset * PacketSize);
Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col;
eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds");
Index packetOffset = offset * PacketSize;
Index actualRow = IsRowMajor ? row : row + packetOffset;
Index actualCol = IsRowMajor ? col + packetOffset : col;
eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds");
return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol);
}
template <int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index actualIndex = index + (offset * PacketSize);
eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds");
Index packetOffset = offset * PacketSize;
Index actualIndex = index + packetOffset;
eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds");
return m_argImpl.template packet<LoadMode, PacketType>(actualIndex);
}
template <int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count,
Index offset) const {
constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index packetOffset = offset * PacketSize;
Index actualRow = IsRowMajor ? row : row + packetOffset;
Index actualCol = IsRowMajor ? col + packetOffset : col;
eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds");
return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
}
template <int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count,
Index offset) const {
constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index packetOffset = offset * PacketSize;
Index actualIndex = index + packetOffset + begin;
eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds");
return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
}
template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col,
Index begin,
Index count) const {
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<PacketType, NumPackets> packets;
Index offset = begin / SrcPacketSize;
Index actualBegin = begin % SrcPacketSize;
for (; offset < NumPackets; offset++) {
Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset);
if (count == actualCount) break;
actualBegin = 0;
count -= actualCount;
}
return packets;
}
template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index,
Index begin,
Index count) const {
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<PacketType, NumPackets> packets;
Index offset = begin / SrcPacketSize;
Index actualBegin = begin % SrcPacketSize;
for (; offset < NumPackets; offset++) {
Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset);
if (count == actualCount) break;
actualBegin = 0;
count -= actualCount;
}
return packets;
}
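// Worked example (illustrative note, not part of the diff): with SrcPacketSize == 4, NumPackets == 2,
// begin == 2 and count == 5, the first iteration fills packets.packet[0] from source positions [2, 4)
// and the second fills packets.packet[1] from positions [4, 7), after which the loop exits; any source
// packet that the requested segment does not overlap is left uninitialized.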
// There is no source packet type with the same number of elements as DstPacketType, or fewer.
// This is problematic as the evaluation loop may attempt to access data outside the bounds of the array.
// For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}.
// The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
// is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array.
// Instead, perform a runtime check to determine whether the load would access data outside the bounds of the array.
// If not, perform a full load. Otherwise, revert to a scalar loop to perform a partial load.
// In either case, perform a vectorized cast of the source packet.
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
SrcPacketType src;
if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) {
src = srcPacket<SrcLoadMode>(row, col, 0);
} else {
Array<SrcType, SrcPacketSize, 1> srcArray;
for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k);
for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
src = pload<SrcPacketType>(srcArray.data());
}
return pcast<SrcPacketType, DstPacketType>(src);
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0));
}
// Use the source packet type with the same size as DstPacketType, if it exists
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
@@ -704,22 +801,67 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7));
}
// packetSegment variants
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0));
}
// Use the source packet type with the same size as DstPacketType, if it exists
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
return pcast<SizedSrcPacketType, DstPacketType>(
srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0));
}
// unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
constexpr int NumPackets = 2;
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<SrcPacketType, NumPackets> packets =
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
}
// unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
constexpr int NumPackets = 4;
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<SrcPacketType, NumPackets> packets =
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
packets.packet[3]);
}
// unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
constexpr int NumPackets = 8;
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<SrcPacketType, NumPackets> packets =
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
packets.packet[3], packets.packet[4], packets.packet[5],
packets.packet[6], packets.packet[7]);
}
// Analogous routines for linear access.
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
SrcPacketType src;
if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) {
src = srcPacket<SrcLoadMode>(index, 0);
} else {
Array<SrcType, SrcPacketSize, 1> srcArray;
for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k);
for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
src = pload<SrcPacketType>(srcArray.data());
}
return pcast<SrcPacketType, DstPacketType>(src);
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0));
}
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
@@ -749,6 +891,55 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7));
}
// packetSegment variants
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0));
}
// Use the source packet type with the same size as DstPacketType, if it exists
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
return pcast<SizedSrcPacketType, DstPacketType>(
srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0));
}
// unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
constexpr int NumPackets = 2;
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<SrcPacketType, NumPackets> packets =
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
}
// unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
constexpr int NumPackets = 4;
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<SrcPacketType, NumPackets> packets =
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
packets.packet[3]);
}
// unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
constexpr int NumPackets = 8;
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<SrcPacketType, NumPackets> packets =
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
packets.packet[3], packets.packet[4], packets.packet[5],
packets.packet[6], packets.packet[7]);
}
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; }
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; }
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; }
@@ -826,6 +1017,20 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
m_d.arg3Impl.template packet<LoadMode, PacketType>(index));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count));
}
protected:
// this helper permits to completely eliminate the functor if it is empty
struct Data {
@@ -922,6 +1127,18 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
m_d.rhsImpl.template packet<LoadMode, PacketType>(index));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count),
m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
}
protected:
// this helper permits to completely eliminate the functor if it is empty
struct Data {
@@ -1013,7 +1230,7 @@ struct mapbase_evaluator : evaluator_base<Derived> {
m_innerStride(map.innerStride()),
m_outerStride(map.outerStride()) {
EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0,
internal::inner_stride_at_compile_time<Derived>::ret == 1),
inner_stride_at_compile_time<Derived>::ret == 1),
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
@@ -1035,23 +1252,47 @@ struct mapbase_evaluator : evaluator_base<Derived> {
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
PointerType ptr = m_data + row * rowStride() + col * colStride();
return internal::ploadt<PacketType, LoadMode>(ptr);
return ploadt<PacketType, LoadMode>(ptr);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
PointerType ptr = m_data + row * rowStride() + col * colStride();
return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
pstoret<Scalar, PacketType, StoreMode>(ptr, x);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
PointerType ptr = m_data + row * rowStride() + col * colStride();
return ploadtSegment<PacketType, LoadMode>(ptr, begin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
PointerType ptr = m_data + row * rowStride() + col * colStride();
pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x, begin, count);
}
protected:
@@ -1063,8 +1304,8 @@ struct mapbase_evaluator : evaluator_base<Derived> {
}
PointerType m_data;
const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
};
template <typename PlainObjectType, int MapOptions, typename StrideType>
@@ -1117,7 +1358,7 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType>>
// -------------------- Block --------------------
template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
bool HasDirectAccess = internal::has_direct_access<ArgType>::ret>
bool HasDirectAccess = has_direct_access<ArgType>::ret>
struct block_evaluator;
template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -1246,6 +1487,39 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
x);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col,
begin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
if (ForwardLinearAccess)
return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
else
return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row,
m_startCol.value() + col, x, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
if (ForwardLinearAccess)
return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
count);
else
return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0, x, begin, count);
}
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
@@ -1341,8 +1615,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
typedef typename internal::nested_eval<ArgType, Factor>::type ArgTypeNested;
typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
enum {
CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
@@ -1361,19 +1635,15 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
// try to avoid using modulo; this is a pure optimization strategy
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
: RowFactor == 1 ? row
: row % m_rows.value();
const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
: ColFactor == 1 ? col
: col % m_cols.value();
const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
return m_argImpl.coeff(actual_row, actual_col);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
// try to avoid using modulo; this is a pure optimization strategy
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
? (ColFactor == 1 ? index : index % m_cols.value())
: (RowFactor == 1 ? index : index % m_rows.value());
@@ -1382,25 +1652,38 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
: RowFactor == 1 ? row
: row % m_rows.value();
const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
: ColFactor == 1 ? col
: col % m_cols.value();
const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
? (ColFactor == 1 ? index : index % m_cols.value())
: (RowFactor == 1 ? index : index % m_rows.value());
return m_argImpl.template packet<LoadMode, PacketType>(actual_index);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
? (ColFactor == 1 ? index : index % m_cols.value())
: (RowFactor == 1 ? index : index % m_rows.value());
return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count);
}
protected:
const ArgTypeNested m_arg;
evaluator<ArgTypeNestedCleaned> m_argImpl;
@@ -1457,6 +1740,28 @@ struct evaluator_wrapper_base : evaluator_base<XprType> {
m_argImpl.template writePacket<StoreMode>(index, x);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count);
}
template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count);
}
protected:
evaluator<ArgType> m_argImpl;
};
@@ -1536,41 +1841,97 @@ struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<Arg
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
enum {
PacketSize = unpacket_traits<PacketType>::size,
OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
};
typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(
ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? m_cols.value() - col - OffsetCol : col));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
enum { PacketSize = unpacket_traits<PacketType>::size };
return preverse(
m_argImpl.template packet<LoadMode, PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
// FIXME we could factorize some code with packet(i,j)
enum {
PacketSize = unpacket_traits<PacketType>::size,
OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
};
typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
m_argImpl.template writePacket<LoadMode>(ReverseRow ? m_rows.value() - row - OffsetRow : row,
ReverseCol ? m_cols.value() - col - OffsetCol : col,
reverse_packet::run(x));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
enum { PacketSize = unpacket_traits<PacketType>::size };
m_argImpl.template writePacket<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
return reverse_packet::run(
m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
Index actualBegin = PacketSize - count - begin;
return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count));
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
Index actualBegin = PacketSize - count - begin;
m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count);
}
protected:
@@ -1621,7 +1982,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex>> : evaluator_base<Diagonal<ArgType
protected:
evaluator<ArgType> m_argImpl;
const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const {

View File

@ -1562,6 +1562,72 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
return (Packet)pand(result, peven_mask(result)); // atan2 0 atan2 0 ...
}
/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
* outside this range are not defined. \a *from does not need to be aligned, and can be null if \a count is zero.*/
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
Index count) {
using Scalar = typename unpacket_traits<Packet>::type;
constexpr Index PacketSize = unpacket_traits<Packet>::size;
eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
Scalar aux[PacketSize];
smart_copy(from + begin, from + begin + count, aux + begin);
return ploadu<Packet>(aux);
}
/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
* outside this range are not defined. \a *from must be aligned, and cannot be null.*/
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
Index count) {
return ploaduSegment<Packet>(from, begin, count);
}
/** \internal copy the elements of the packet \a from in the range [begin, begin + count) to \a *to.
Elements of \a from outside of this range are ignored, and the corresponding elements of \a *to are left
unmodified. \a *to does not need to be aligned, and can be null if \a count is zero.*/
template <typename Scalar, typename Packet>
EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) {
constexpr Index PacketSize = unpacket_traits<Packet>::size;
eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
Scalar aux[PacketSize];
pstoreu<Scalar, Packet>(aux, from);
smart_copy(aux + begin, aux + begin + count, to + begin);
}
/** \internal copy the elements of the packet \a from in the range [begin, begin + count) to \a *to.
Elements of \a from outside of this range are ignored, and the corresponding elements of \a *to are left
unmodified. \a *to must be aligned, and cannot be null.*/
template <typename Scalar, typename Packet>
EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) {
return pstoreuSegment(to, from, begin, count);
}
/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
* outside this range are not defined.*/
template <typename Packet, int Alignment>
EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
Index count) {
constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
if (Alignment >= RequiredAlignment) {
return ploadSegment<Packet>(from, begin, count);
} else {
return ploaduSegment<Packet>(from, begin, count);
}
}
/** \internal copy the elements of the packet \a from in the range [begin, begin + count) to \a *to.
Elements of \a from outside of this range are ignored, and the corresponding elements of \a *to are left unmodified.*/
template <typename Scalar, typename Packet, int Alignment>
EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) {
constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
if (Alignment >= RequiredAlignment) {
pstoreSegment<Scalar, Packet>(to, from, begin, count);
} else {
pstoreuSegment<Scalar, Packet>(to, from, begin, count);
}
}
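/* A minimal usage sketch (illustration only, not part of the patch): negate the first `count`
   elements addressed by `from` (with count <= the packet size) using one masked load/store pair;
   memory at from[count] and beyond is neither read nor written. The helper name is hypothetical. */
template <typename Packet>
EIGEN_DEVICE_FUNC inline void pnegate_segment_sketch(typename unpacket_traits<Packet>::type* from, Index count) {
  using Scalar = typename unpacket_traits<Packet>::type;
  Packet p = ploaduSegment<Packet>(from, 0, count);            // reads from[0..count) only
  pstoreuSegment<Scalar, Packet>(from, pnegate(p), 0, count);  // writes from[0..count) only
}
// e.g. pnegate_segment_sketch<Packet4f>(ptr, 3) flips the sign of ptr[0], ptr[1], ptr[2] only.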
#ifndef EIGEN_NO_IO
template <typename Packet>

View File

@ -283,7 +283,7 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, cons
template <typename Lhs, typename Rhs>
struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
template <typename T>
struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {};
struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
@ -445,7 +445,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductM
eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
std::conditional_t<HasScalarFactor, true_type, false_type>());
bool_constant<HasScalarFactor>());
}
protected:
@ -635,6 +635,24 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
return packet<LoadMode, PacketType>(row, col);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
PacketType res;
typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
PacketImpl;
PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
return res;
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
return packetSegment<LoadMode, PacketType>(row, col, begin, count);
}
protected:
add_const_on_value_type_t<LhsNested> m_lhs;
add_const_on_value_type_t<RhsNested> m_rhs;
@ -670,6 +688,13 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
row, col, lhs, rhs, innerDim, res, begin, count);
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
}
};
template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -681,6 +706,13 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
row, col, lhs, rhs, innerDim, res, begin, count);
res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
}
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -689,6 +721,12 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
Index /*innerDim*/, Packet& res) {
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index /*innerDim*/, Packet& res, Index begin,
Index count) {
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),
rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
}
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -697,6 +735,12 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> {
Index /*innerDim*/, Packet& res) {
res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index /*innerDim*/, Packet& res, Index begin,
Index count) {
res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
pset1<Packet>(rhs.coeff(Index(0), col)));
}
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -705,6 +749,11 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> {
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
Index /*begin*/, Index /*count*/) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -713,6 +762,11 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> {
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
Index /*begin*/, Index /*count*/) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -723,6 +777,13 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
for (Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for (Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
res);
}
};
template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -733,6 +794,13 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
for (Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for (Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
res);
}
};
/***************************************************************************
@ -871,6 +939,26 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
internal::true_type) const {
return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
internal::pset1<PacketType>(m_diagImpl.coeff(id)));
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
internal::false_type) const {
enum {
InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
DiagonalPacketLoadMode = plain_enum_min(
LoadMode,
((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
};
return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
}
evaluator<DiagonalType> m_diagImpl;
evaluator<MatrixType> m_matImpl;
};
@ -892,7 +980,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
typedef typename XprType::PlainObject PlainObject;
typedef typename Lhs::DiagonalVectorType DiagonalType;
enum { StorageOrder = Base::StorageOrder_ };
static constexpr int StorageOrder = Base::StorageOrder_;
using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
@ -905,8 +994,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
// FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
// See also similar calls below.
return this->template packet_impl<LoadMode, PacketType>(
row, col, row, std::conditional_t<int(StorageOrder) == RowMajor, internal::true_type, internal::false_type>());
return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
}
template <int LoadMode, typename PacketType>
@ -914,6 +1002,19 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
int(StorageOrder) == ColMajor ? 0 : idx);
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
// FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
// See also similar calls below.
return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
begin, count);
}
#endif
};
@ -933,7 +1034,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
typedef Product<Lhs, Rhs, ProductKind> XprType;
typedef typename XprType::PlainObject PlainObject;
enum { StorageOrder = Base::StorageOrder_ };
static constexpr int StorageOrder = Base::StorageOrder_;
using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
@ -944,14 +1046,23 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
#ifndef EIGEN_GPUCC
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
return this->template packet_impl<LoadMode, PacketType>(
row, col, col, std::conditional_t<int(StorageOrder) == ColMajor, internal::true_type, internal::false_type>());
return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
int(StorageOrder) == ColMajor ? 0 : idx);
return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
}
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
begin, count);
}
#endif
};

View File

@ -65,6 +65,31 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode, LoadMode, PacketType>(row, col);
}
template <int StoreMode, int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count);
m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count);
}
template <int StoreMode, int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count);
const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count);
m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
}
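// Illustration (not part of the patch): each overload above performs a masked swap of lanes
// [begin, begin + count) only: it saves the source segment, copies the destination segment into
// the source, then writes the saved segment into the destination; elements outside the segment
// are left untouched on both sides.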
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
// mean no CRTP (Gael)
template <int StoreMode, int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count);
}
};
} // namespace internal

View File

@ -36,6 +36,10 @@ template <typename MatrixType, typename MemberOp, int Direction>
class PartialReduxExpr;
namespace internal {
template <typename ArgType, typename MemberOp, int Direction>
struct enable_packet_segment<PartialReduxExpr<ArgType, MemberOp, Direction>> : std::false_type {};
template <typename MatrixType, typename MemberOp, int Direction>
struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> {
typedef typename MemberOp::result_type Scalar;

View File

@ -507,6 +507,57 @@ EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, con
return pnegate(pmadd(a, b, c));
}
#endif
/*---------------- load/store segment support ----------------*/
/*---------------- std::complex<float> ----------------*/
template <>
struct has_packet_segment<Packet2cf> : std::true_type {};
template <>
struct has_packet_segment<Packet4cf> : std::true_type {};
template <>
inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) {
return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
}
template <>
inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin,
Index count) {
_mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
}
template <>
inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) {
return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
}
template <>
inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin,
Index count) {
_mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
}
/*---------------- std::complex<double> ----------------*/
template <>
struct has_packet_segment<Packet2cd> : std::true_type {};
template <>
inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) {
return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
}
template <>
inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
Index begin, Index count) {
_mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
}
/*---------------- end load/store segment support ----------------*/
} // end namespace internal
} // end namespace Eigen

View File

@ -2938,6 +2938,258 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
}
/*---------------- load/store segment support ----------------*/
// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_4x8(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 4);
long long mask = 1;
mask <<= CHAR_BIT * count;
mask--;
mask <<= CHAR_BIT * begin;
#if defined(_WIN32) && !defined(_WIN64)
return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
#else
return _mm_cvtsi64_si128(mask);
#endif
}
// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_8x8(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 8);
long long mask = 1;
// avoid UB when count == 8
mask <<= (CHAR_BIT / 2) * count;
mask <<= (CHAR_BIT / 2) * count;
mask--;
mask <<= CHAR_BIT * begin;
#if defined(_WIN32) && !defined(_WIN64)
return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
#else
return _mm_cvtsi64_si128(mask);
#endif
}
// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_4x32(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 4);
return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));
}
// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_2x64(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 2);
return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));
}
// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m256i segment_mask_8x32(Index begin, Index count) {
__m128i mask_epi8 = segment_mask_8x8(begin, count);
#ifdef EIGEN_VECTORIZE_AVX2
__m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);
#else
__m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);
__m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));
__m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
#endif
return mask_epi32;
}
// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m256i segment_mask_4x64(Index begin, Index count) {
__m128i mask_epi8 = segment_mask_4x8(begin, count);
#ifdef EIGEN_VECTORIZE_AVX2
__m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);
#else
__m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);
__m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));
__m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
#endif
return mask_epi64;
}
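// Illustration only (hypothetical helper, not part of the patch): a scalar reference model of
// the masks built above. Lane i of segment_mask_4x32(begin, count) is all-ones exactly when
// begin <= i < begin + count, and all-zeros otherwise; e.g. (begin, count) == (1, 2) selects
// lanes 1 and 2, so _mm_maskload_ps(from, segment_mask_4x32(1, 2)) reads only from[1] and from[2].
inline uint32_t segment_mask_reference_lane(Index lane, Index begin, Index count) {
  return (lane >= begin && lane < begin + count) ? 0xFFFFFFFFu : 0u;
}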
/*---------------- float ----------------*/
template <>
struct has_packet_segment<Packet4f> : std::true_type {};
template <>
struct has_packet_segment<Packet8f> : std::true_type {};
template <>
inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
}
template <>
inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
_mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
}
template <>
inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
}
template <>
inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
_mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
}
/*---------------- int32 ----------------*/
template <>
struct has_packet_segment<Packet4i> : std::true_type {};
template <>
struct has_packet_segment<Packet8i> : std::true_type {};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
}
template <>
inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
_mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
}
template <>
inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
}
template <>
inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
_mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
}
#else
template <>
inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
}
template <>
inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
}
template <>
inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
}
template <>
inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
}
#endif
/*---------------- uint32 ----------------*/
template <>
struct has_packet_segment<Packet4ui> : std::true_type {};
template <>
struct has_packet_segment<Packet8ui> : std::true_type {};
template <>
inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
}
template <>
inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
}
template <>
inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
}
template <>
inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
}
/*---------------- double ----------------*/
template <>
struct has_packet_segment<Packet2d> : std::true_type {};
template <>
struct has_packet_segment<Packet4d> : std::true_type {};
template <>
inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
}
template <>
inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
_mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
}
template <>
inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
}
template <>
inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
_mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
}
#ifdef EIGEN_VECTORIZE_AVX2
/*---------------- int64_t ----------------*/
template <>
struct has_packet_segment<Packet2l> : std::true_type {};
template <>
struct has_packet_segment<Packet4l> : std::true_type {};
template <>
inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
}
template <>
inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
_mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
}
template <>
inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
}
template <>
inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
_mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
}
/*---------------- uint64_t ----------------*/
template <>
struct has_packet_segment<Packet4ul> : std::true_type {};
template <>
inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
}
template <>
inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
}
#endif
/*---------------- end load/store segment support ----------------*/
} // end namespace internal
} // end namespace Eigen

View File

@ -29,6 +29,11 @@ struct assign_op {
EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
pstoret<DstScalar, Packet, Alignment>(a, b);
}
template <int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count);
}
};
// Empty overload for void type (used by PermutationMatrix)
@ -60,6 +65,12 @@ struct compound_assign_op {
assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
}
template <int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>(
a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count);
}
};
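// Illustration (not part of the patch): with an add-assign functor, assignPacketSegment above is
// a masked read-modify-write, i.e. a[j] += (lane j of b) for j in [begin, begin + count), while
// elements of a outside the segment are neither read nor written.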
template <typename DstScalar, typename SrcScalar, typename Func>

View File

@ -438,7 +438,6 @@ struct scalar_quotient_op : binary_op_base<LhsScalar, RhsScalar> {
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
maybe_raise_div_by_zero<Packet>::run(b);
return internal::pdiv(a, b);
}
};

View File

@ -28,7 +28,7 @@ struct scalar_constant_op {
const Scalar m_other;
};
template <typename Scalar>
struct functor_traits<scalar_constant_op<Scalar> > {
struct functor_traits<scalar_constant_op<Scalar>> {
enum {
Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
PacketAccess = packet_traits<Scalar>::Vectorizable,
@ -56,7 +56,7 @@ struct scalar_identity_op {
}
};
template <typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> > {
struct functor_traits<scalar_identity_op<Scalar>> {
enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true };
};
@ -86,18 +86,19 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
// Principle:
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
Packet low = pset1<Packet>(m_low);
Packet high = pset1<Packet>(m_high);
Packet step = pset1<Packet>(m_step);
if (m_flip) {
Packet pi = plset<Packet>(Scalar(i - m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if (EIGEN_PREDICT_TRUE(i != 0)) return res;
Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
return pselect<Packet>(mask, res, pset1<Packet>(m_low));
Packet res = pmadd(step, pi, high);
Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i)));
return pselect<Packet>(mask, res, low);
} else {
Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits<Packet>::size + 1)) return res;
Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size - 1));
return pselect<Packet>(mask, res, pset1<Packet>(m_high));
Packet res = pmadd(step, pi, low);
Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1)));
return pselect<Packet>(mask, res, high);
}
}
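// Illustration (not part of the patch), assuming LinSpaced(5, 0.0f, 1.0f) and a 4-lane packet
// evaluated for the tail at i == 1: m_size1 == 4 and pi == {1, 2, 3, 4}, so the mask is
// {true, true, true, false} and the last lane is set to exactly `high` rather than computed as
// low + step * 4; this reproduces the endpoint exactly even when `step` carries rounding error.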
@ -139,7 +140,7 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ true> {
template <typename Scalar>
struct linspaced_op;
template <typename Scalar>
struct functor_traits<linspaced_op<Scalar> > {
struct functor_traits<linspaced_op<Scalar>> {
enum {
Cost = 1,
PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
@ -192,7 +193,7 @@ struct equalspaced_op {
};
template <typename Scalar>
struct functor_traits<equalspaced_op<Scalar> > {
struct functor_traits<equalspaced_op<Scalar>> {
enum {
Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
PacketAccess =

View File

@ -514,6 +514,12 @@ template <typename Xpr>
struct eigen_memset_helper;
template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
struct eigen_zero_impl;
template <typename Packet>
struct has_packet_segment : std::false_type {};
template <typename Xpr>
struct enable_packet_segment : std::true_type {};
} // namespace internal
} // end namespace Eigen

View File

@ -90,12 +90,8 @@ namespace internal {
* we however don't want to add a dependency to Boost.
*/
struct true_type {
enum { value = 1 };
};
struct false_type {
enum { value = 0 };
};
using std::false_type;
using std::true_type;
template <bool Condition>
struct bool_constant;

View File

@ -996,6 +996,27 @@ struct is_matrix_base_xpr : std::is_base_of<MatrixBase<remove_all_t<XprType>>, r
template <typename XprType>
struct is_permutation_base_xpr : std::is_base_of<PermutationBase<remove_all_t<XprType>>, remove_all_t<XprType>> {};
/*---------------- load/store segment support ----------------*/
// recursively traverse nullary, unary, binary, and ternary expressions to determine whether packet segments are supported
template <typename Func, typename Xpr>
struct enable_packet_segment<CwiseNullaryOp<Func, Xpr>> : enable_packet_segment<remove_all_t<Xpr>> {};
template <typename Func, typename Xpr>
struct enable_packet_segment<CwiseUnaryOp<Func, Xpr>> : enable_packet_segment<remove_all_t<Xpr>> {};
template <typename Func, typename LhsXpr, typename RhsXpr>
struct enable_packet_segment<CwiseBinaryOp<Func, LhsXpr, RhsXpr>>
: bool_constant<enable_packet_segment<remove_all_t<LhsXpr>>::value &&
enable_packet_segment<remove_all_t<RhsXpr>>::value> {};
template <typename Func, typename LhsXpr, typename MidXpr, typename RhsXpr>
struct enable_packet_segment<CwiseTernaryOp<Func, LhsXpr, MidXpr, RhsXpr>>
: bool_constant<enable_packet_segment<remove_all_t<LhsXpr>>::value &&
enable_packet_segment<remove_all_t<MidXpr>>::value &&
enable_packet_segment<remove_all_t<RhsXpr>>::value> {};
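// Illustration (not part of the patch): for a dense expression such as (a + b).cwiseAbs(), the
// traversal above bottoms out at the leaves and yields enable_packet_segment<...>::value == true,
// so the assignment loops may use packetSegment for the unaligned head and the partial tail; a
// PartialReduxExpr anywhere in the tree (e.g. m.colwise().sum()) disables it via its
// std::false_type specialization.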
} // end namespace internal
/** \class ScalarBinaryOpTraits

View File

@ -308,19 +308,24 @@ struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVec
this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
}
};
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
using head_loop =
unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
const Index size = kernel.size();
const Index alignedStart =
DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);
head_loop::run(kernel, 0, alignedStart);
constexpr float cost = static_cast<float>(XprEvaluationCost);
AssignmentFunctor functor(kernel);
device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
tail_loop::run(kernel, alignedEnd, size);
}
};

View File

@ -186,6 +186,7 @@ ei_add_test(mixingtypes)
ei_add_test(float_conversion)
ei_add_test(io)
ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
ei_add_test(packet_segment)
ei_add_test(vectorization_logic)
ei_add_test(basicstuff)
ei_add_test(constexpr)

168
test/packet_segment.cpp Normal file
View File

@ -0,0 +1,168 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 The Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
template <typename Scalar, typename Packet>
void verify_data(const Scalar* data_in, const Scalar* data_out, const Packet& a, Index begin, Index count) {
constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
bool ok = true;
for (Index i = begin; i < begin + count; i++) {
ok = ok && numext::equal_strict(data_in[i], data_out[i]);
}
if (!ok) {
std::cout << "begin: " << begin << ", count: " << count << "\n";
std::cout << "Scalar type: " << type_name(Scalar()) << " x " << PacketSize << "\n";
std::cout << "data in: {";
for (Index i = 0; i < PacketSize; i++) {
if (i > 0) std::cout << ",";
if (i < begin || i >= begin + count) {
std::cout << "MASK";
} else {
std::cout << data_in[i];
}
}
std::cout << "}\n";
std::cout << "data out: {";
for (Index i = 0; i < PacketSize; i++) {
if (i > 0) std::cout << ",";
if (i < begin || i >= begin + count) {
std::cout << "MASK";
} else {
std::cout << data_out[i];
}
}
std::cout << "}\n";
std::cout << "packet: ";
std::cout << internal::postream(a) << "\n";
}
VERIFY(ok);
}
template <typename Scalar, int PacketSize, bool Run = internal::find_packet_by_size<Scalar, PacketSize>::value>
struct packet_segment_test_impl {
using Packet = typename internal::find_packet_by_size<Scalar, PacketSize>::type;
static void test_unaligned() {
// test loading a packet segment from unaligned memory that includes unallocated memory
// | X X X X | * * * X | X X X X |
// begin -> { X | * * * } <- begin + count
VectorX<Scalar> data_in(PacketSize), data_out(PacketSize);
data_in.setRandom();
data_out.setRandom();
Scalar* unaligned_data_in = data_in.data() - 1;
Scalar* unaligned_data_out = data_out.data() - 1;
Index begin = 1;
Index count = PacketSize - 1;
Packet a = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, a, begin, count);
verify_data(unaligned_data_in, unaligned_data_out, a, begin, count);
// test loading the entire packet
data_in.setRandom();
data_out.setRandom();
unaligned_data_in = data_in.data();
unaligned_data_out = data_out.data();
begin = 0;
count = PacketSize;
Packet b = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, b, begin, count);
verify_data(unaligned_data_in, unaligned_data_out, b, begin, count);
// test that loading and storing an empty packet segment (count == 0) is a no-op
count = 0;
for (begin = 0; begin < PacketSize; begin++) {
data_in.setRandom();
data_out = data_in;
Packet c = internal::ploaduSegment<Packet>(data_in.data(), begin, count);
internal::pstoreuSegment<Scalar, Packet>(data_out.data(), c, begin, count);
// verify that ploaduSegment / pstoreuSegment did nothing
VERIFY_IS_CWISE_EQUAL(data_in, data_out);
}
}
static void test_aligned() {
// test loading a packet segment from aligned memory that includes unallocated memory
// | X X X X | * * * X | X X X X |
// begin -> { * * * X } <- begin + count
VectorX<Scalar> data_in(PacketSize - 1), data_out(PacketSize - 1);
data_in.setRandom();
data_out.setRandom();
Scalar* aligned_data_in = data_in.data();
Scalar* aligned_data_out = data_out.data();
Index begin = 0;
Index count = PacketSize - 1;
Packet b = internal::ploadSegment<Packet>(aligned_data_in, begin, count);
internal::pstoreSegment<Scalar, Packet>(aligned_data_out, b, begin, count);
verify_data(aligned_data_in, aligned_data_out, b, begin, count);
}
static void run() {
test_unaligned();
test_aligned();
}
};
template <typename Scalar, int PacketSize>
struct packet_segment_test_impl<Scalar, PacketSize, false> {
static void run() {}
};
template <typename Scalar, int PacketSize>
struct packet_segment_test_driver {
static void run() {
packet_segment_test_impl<Scalar, PacketSize>::run();
packet_segment_test_driver<Scalar, PacketSize / 2>::run();
}
};
template <typename Scalar>
struct packet_segment_test_driver<Scalar, 1> {
static void run() {}
};
template <typename Scalar>
void test_packet_segment() {
packet_segment_test_driver<Scalar, internal::packet_traits<Scalar>::size>::run();
}
EIGEN_DECLARE_TEST(packet_segment) {
for (int i = 0; i < g_repeat; i++) {
test_packet_segment<bool>();
test_packet_segment<int8_t>();
test_packet_segment<uint8_t>();
test_packet_segment<int16_t>();
test_packet_segment<uint16_t>();
test_packet_segment<int32_t>();
test_packet_segment<uint32_t>();
test_packet_segment<int64_t>();
test_packet_segment<uint64_t>();
test_packet_segment<bfloat16>();
test_packet_segment<half>();
test_packet_segment<float>();
test_packet_segment<double>();
test_packet_segment<std::complex<float>>();
test_packet_segment<std::complex<double>>();
}
}