mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-01 02:53:59 +08:00
masked load/store framework
This commit is contained in:
parent
cebe09110c
commit
28c3b26d53
@ -136,6 +136,8 @@ struct copy_using_evaluator_traits {
|
||||
: Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
|
||||
#endif
|
||||
: NoUnrolling;
|
||||
static constexpr bool UsePacketSegment =
|
||||
enable_packet_segment<Src>::value && enable_packet_segment<Dst>::value && has_packet_segment<PacketType>::value;
|
||||
|
||||
#ifdef EIGEN_DEBUG_ASSIGN
|
||||
static void debug() {
|
||||
@ -273,6 +275,33 @@ struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlign
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment>
|
||||
struct copy_using_evaluator_innervec_segment {
|
||||
using PacketType = typename Kernel::PacketType;
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
|
||||
kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0,
|
||||
Stop - Start);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment>
|
||||
struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment,
|
||||
/*UsePacketSegment*/ false>
|
||||
: copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {};
|
||||
|
||||
template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
|
||||
struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
|
||||
/*UsePacketSegment*/ true> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
|
||||
struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
|
||||
/*UsePacketSegment*/ false> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
|
||||
};
|
||||
|
||||
/***************************************************************************
|
||||
* Part 3 : implementation of all cases
|
||||
***************************************************************************/
|
||||
@ -353,28 +382,48 @@ struct dense_assignment_loop_impl<Kernel, DefaultTraversal, InnerUnrolling> {
|
||||
// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
|
||||
// of the non vectorizable beginning and ending parts
|
||||
|
||||
template <bool IsAligned = false>
|
||||
template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip>
|
||||
struct unaligned_dense_assignment_loop {
|
||||
// if IsAligned = true, then do nothing
|
||||
// if Skip == true, then do nothing
|
||||
template <typename Kernel>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {}
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& /*kernel*/, Index /*start*/,
|
||||
Index /*end*/) {}
|
||||
template <typename Kernel>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& /*kernel*/, Index /*outer*/,
|
||||
Index /*innerStart*/, Index /*innerEnd*/) {}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct unaligned_dense_assignment_loop<false> {
|
||||
// MSVC must not inline this functions. If it does, it fails to optimize the
|
||||
// packet access path.
|
||||
// FIXME check which version exhibits this issue
|
||||
#if EIGEN_COMP_MSVC
|
||||
template <typename PacketType, int DstAlignment, int SrcAlignment>
|
||||
struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true,
|
||||
/*Skip*/ false> {
|
||||
template <typename Kernel>
|
||||
static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end)
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) {
|
||||
Index count = end - start;
|
||||
eigen_assert(count <= unpacket_traits<PacketType>::size);
|
||||
if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count);
|
||||
}
|
||||
template <typename Kernel>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end)
|
||||
#endif
|
||||
{
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index outer, Index start,
|
||||
Index end) {
|
||||
Index count = end - start;
|
||||
eigen_assert(count <= unpacket_traits<PacketType>::size);
|
||||
if (count > 0)
|
||||
kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename PacketType, int DstAlignment, int SrcAlignment>
|
||||
struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false,
|
||||
/*Skip*/ false> {
|
||||
template <typename Kernel>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) {
|
||||
for (Index index = start; index < end; ++index) kernel.assignCoeff(index);
|
||||
}
|
||||
template <typename Kernel>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index outer, Index innerStart,
|
||||
Index innerEnd) {
|
||||
for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Index_, int Stop>
|
||||
@ -395,28 +444,60 @@ struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Index_, int Stop, bool UsePacketSegment>
|
||||
struct copy_using_evaluator_linearvec_segment {
|
||||
using PacketType = typename Kernel::PacketType;
|
||||
static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
|
||||
static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
|
||||
kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Index_, int Stop>
|
||||
struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false>
|
||||
: copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {};
|
||||
|
||||
template <typename Kernel, int Stop>
|
||||
struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
|
||||
};
|
||||
|
||||
template <typename Kernel, int Stop>
|
||||
struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, NoUnrolling> {
|
||||
using Scalar = typename Kernel::Scalar;
|
||||
using PacketType = typename Kernel::PacketType;
|
||||
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment;
|
||||
static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
|
||||
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
|
||||
static constexpr int DstAlignment =
|
||||
packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
|
||||
static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
|
||||
static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
|
||||
static constexpr bool Alignable =
|
||||
(DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
|
||||
static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
|
||||
static constexpr bool DstIsAligned = DstAlignment >= Alignment;
|
||||
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
|
||||
|
||||
using head_loop =
|
||||
unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
|
||||
using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>;
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
|
||||
const Index size = kernel.size();
|
||||
const Index alignedStart = DstIsAligned ? 0 : first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
|
||||
const Index alignedStart = DstIsAligned ? 0 : first_aligned<Alignment>(kernel.dstDataPtr(), size);
|
||||
const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
|
||||
|
||||
unaligned_dense_assignment_loop<DstIsAligned>::run(kernel, 0, alignedStart);
|
||||
head_loop::run(kernel, 0, alignedStart);
|
||||
|
||||
for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
|
||||
kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
|
||||
kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index);
|
||||
|
||||
unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
|
||||
tail_loop::run(kernel, alignedEnd, size);
|
||||
}
|
||||
};
|
||||
|
||||
@ -426,10 +507,11 @@ struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, CompleteUnr
|
||||
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
|
||||
static constexpr int AlignedSize = numext::round_down(Size, PacketSize);
|
||||
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
|
||||
copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);
|
||||
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, AlignedSize, Size>::run(kernel);
|
||||
copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel);
|
||||
}
|
||||
};
|
||||
|
||||
@ -505,35 +587,35 @@ struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, NoUnrolling>
|
||||
using Scalar = typename Kernel::Scalar;
|
||||
using PacketType = typename Kernel::PacketType;
|
||||
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment;
|
||||
static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
|
||||
static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
|
||||
static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
|
||||
static constexpr bool Alignable =
|
||||
packet_traits<Scalar>::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar);
|
||||
static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
|
||||
static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
|
||||
(DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
|
||||
static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
|
||||
static constexpr bool DstIsAligned = DstAlignment >= Alignment;
|
||||
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
|
||||
|
||||
using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>;
|
||||
using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>;
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
|
||||
const Scalar* dst_ptr = kernel.dstDataPtr();
|
||||
if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) {
|
||||
// the pointer is not aligned-on scalar, so alignment is not possible
|
||||
return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel);
|
||||
}
|
||||
const Index innerSize = kernel.innerSize();
|
||||
const Index outerSize = kernel.outerSize();
|
||||
const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;
|
||||
Index alignedStart =
|
||||
((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<RequestedAlignment>(dst_ptr, innerSize);
|
||||
Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<Alignment>(dst_ptr, innerSize);
|
||||
|
||||
for (Index outer = 0; outer < outerSize; ++outer) {
|
||||
const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);
|
||||
// do the non-vectorizable part of the assignment
|
||||
for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
|
||||
|
||||
head_loop::run(kernel, outer, 0, alignedStart);
|
||||
|
||||
// do the vectorizable part of the assignment
|
||||
for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
|
||||
kernel.template assignPacketByOuterInner<DstAlignment, Unaligned, PacketType>(outer, inner);
|
||||
kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner);
|
||||
|
||||
// do the non-vectorizable part of the assignment
|
||||
for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
|
||||
tail_loop::run(kernel, outer, alignedEnd, innerSize);
|
||||
|
||||
alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);
|
||||
}
|
||||
@ -547,11 +629,16 @@ struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, InnerUnrolli
|
||||
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
|
||||
static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);
|
||||
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
|
||||
|
||||
using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>;
|
||||
using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned,
|
||||
Unaligned, UsePacketSegment>;
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
|
||||
for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
|
||||
copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, 0, 0>::run(kernel, outer);
|
||||
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, VectorizableSize, InnerSize>::run(kernel, outer);
|
||||
packet_loop::run(kernel, outer);
|
||||
packet_segment_loop::run(kernel, outer);
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -635,6 +722,27 @@ class generic_dense_assignment_kernel {
|
||||
assignPacket<StoreMode, LoadMode, Packet>(row, col);
|
||||
}
|
||||
|
||||
template <int StoreMode, int LoadMode, typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
|
||||
m_functor.template assignPacketSegment<StoreMode>(
|
||||
&m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin,
|
||||
count);
|
||||
}
|
||||
|
||||
template <int StoreMode, int LoadMode, typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
|
||||
m_functor.template assignPacketSegment<StoreMode>(
|
||||
&m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, int LoadMode, typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin,
|
||||
Index count) {
|
||||
Index row = rowIndexByOuterInner(outer, inner);
|
||||
Index col = colIndexByOuterInner(outer, inner);
|
||||
assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
|
||||
typedef typename DstEvaluatorType::ExpressionTraits Traits;
|
||||
return int(Traits::RowsAtCompileTime) == 1 ? 0
|
||||
|
@ -198,19 +198,13 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
|
||||
if (IsRowMajor)
|
||||
return m_d.data[row * m_d.outerStride() + col];
|
||||
else
|
||||
return m_d.data[row + col * m_d.outerStride()];
|
||||
return coeff(getIndex(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
|
||||
if (IsRowMajor)
|
||||
return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
|
||||
else
|
||||
return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
|
||||
return coeffRef(getIndex(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
|
||||
@ -219,10 +213,7 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
|
||||
if (IsRowMajor)
|
||||
return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
|
||||
else
|
||||
return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
|
||||
return packet<LoadMode, PacketType>(getIndex(row, col));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
@ -232,19 +223,43 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
|
||||
if (IsRowMajor)
|
||||
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
|
||||
else
|
||||
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
|
||||
writePacket<StoreMode, PacketType>(getIndex(row, col), x);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
|
||||
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
|
||||
pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count);
|
||||
}
|
||||
|
||||
protected:
|
||||
plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
|
||||
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndex(Index row, Index col) const {
|
||||
return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
|
||||
@ -318,6 +333,28 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpos
|
||||
m_argImpl.template writePacket<StoreMode, PacketType>(index, x);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count);
|
||||
}
|
||||
|
||||
protected:
|
||||
evaluator<ArgType> m_argImpl;
|
||||
};
|
||||
@ -464,10 +501,10 @@ template <typename NullaryOp, typename PlainObjectType>
|
||||
struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
|
||||
: evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
|
||||
typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
|
||||
typedef internal::remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
|
||||
typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
|
||||
CoeffReadCost = functor_traits<NullaryOp>::Cost,
|
||||
|
||||
Flags = (evaluator<PlainObjectTypeCleaned>::Flags &
|
||||
(HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0) |
|
||||
@ -502,9 +539,21 @@ struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
|
||||
return m_wrapper.template packetOp<PacketType>(m_functor, index);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType, typename IndexType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/,
|
||||
Index /*count*/) const {
|
||||
return packet<LoadMode, PacketType, IndexType>(row, col);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType, typename IndexType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/,
|
||||
Index /*count*/) const {
|
||||
return packet<LoadMode, PacketType, IndexType>(index);
|
||||
}
|
||||
|
||||
protected:
|
||||
const NullaryOp m_functor;
|
||||
const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
|
||||
const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
|
||||
};
|
||||
|
||||
// -------------------- CwiseUnaryOp --------------------
|
||||
@ -546,6 +595,16 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_b
|
||||
return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
|
||||
}
|
||||
|
||||
protected:
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
struct Data {
|
||||
@ -600,16 +659,11 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
|
||||
template <typename DstPacketType>
|
||||
using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>;
|
||||
|
||||
template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) const {
|
||||
return col + packetSize <= cols();
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const {
|
||||
return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows());
|
||||
}
|
||||
template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const {
|
||||
return row + packetSize <= rows();
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const {
|
||||
return index + packetSize <= size();
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const {
|
||||
return index + count + begin <= size();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const {
|
||||
@ -632,43 +686,86 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
|
||||
template <int LoadMode, typename PacketType = SrcPacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
|
||||
constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
Index actualRow = IsRowMajor ? row : row + (offset * PacketSize);
|
||||
Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col;
|
||||
eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds");
|
||||
Index packetOffset = offset * PacketSize;
|
||||
Index actualRow = IsRowMajor ? row : row + packetOffset;
|
||||
Index actualCol = IsRowMajor ? col + packetOffset : col;
|
||||
eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds");
|
||||
return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol);
|
||||
}
|
||||
template <int LoadMode, typename PacketType = SrcPacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
|
||||
constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
Index actualIndex = index + (offset * PacketSize);
|
||||
eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds");
|
||||
Index packetOffset = offset * PacketSize;
|
||||
Index actualIndex = index + packetOffset;
|
||||
eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds");
|
||||
return m_argImpl.template packet<LoadMode, PacketType>(actualIndex);
|
||||
}
|
||||
template <int LoadMode, typename PacketType = SrcPacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count,
|
||||
Index offset) const {
|
||||
constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
Index packetOffset = offset * PacketSize;
|
||||
Index actualRow = IsRowMajor ? row : row + packetOffset;
|
||||
Index actualCol = IsRowMajor ? col + packetOffset : col;
|
||||
eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds");
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
|
||||
}
|
||||
template <int LoadMode, typename PacketType = SrcPacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count,
|
||||
Index offset) const {
|
||||
constexpr int PacketSize = unpacket_traits<PacketType>::size;
|
||||
Index packetOffset = offset * PacketSize;
|
||||
Index actualIndex = index + packetOffset + begin;
|
||||
eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds");
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
|
||||
}
|
||||
|
||||
template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col,
|
||||
Index begin,
|
||||
Index count) const {
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<PacketType, NumPackets> packets;
|
||||
Index offset = begin / SrcPacketSize;
|
||||
Index actualBegin = begin % SrcPacketSize;
|
||||
for (; offset < NumPackets; offset++) {
|
||||
Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
|
||||
packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset);
|
||||
if (count == actualCount) break;
|
||||
actualBegin = 0;
|
||||
count -= actualCount;
|
||||
}
|
||||
return packets;
|
||||
}
|
||||
template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index,
|
||||
Index begin,
|
||||
Index count) const {
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<PacketType, NumPackets> packets;
|
||||
Index offset = begin / SrcPacketSize;
|
||||
Index actualBegin = begin % SrcPacketSize;
|
||||
for (; offset < NumPackets; offset++) {
|
||||
Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
|
||||
packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset);
|
||||
if (count == actualCount) break;
|
||||
actualBegin = 0;
|
||||
count -= actualCount;
|
||||
}
|
||||
return packets;
|
||||
}
|
||||
|
||||
// There is no source packet type with equal or fewer elements than DstPacketType.
|
||||
// This is problematic as the evaluation loop may attempt to access data outside the bounds of the array.
|
||||
// For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}.
|
||||
// The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
|
||||
// is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array.
|
||||
|
||||
// Instead, perform runtime check to determine if the load would access data outside the bounds of the array.
|
||||
// If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load.
|
||||
// In either case, perform a vectorized cast of the source packet.
|
||||
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
|
||||
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
|
||||
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
|
||||
SrcPacketType src;
|
||||
if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) {
|
||||
src = srcPacket<SrcLoadMode>(row, col, 0);
|
||||
} else {
|
||||
Array<SrcType, SrcPacketSize, 1> srcArray;
|
||||
for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k);
|
||||
for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
|
||||
src = pload<SrcPacketType>(srcArray.data());
|
||||
}
|
||||
return pcast<SrcPacketType, DstPacketType>(src);
|
||||
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0));
|
||||
}
|
||||
// Use the source packet type with the same size as DstPacketType, if it exists
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
|
||||
@ -704,22 +801,67 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
|
||||
srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7));
|
||||
}
|
||||
|
||||
// packetSegment variants
|
||||
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
|
||||
Index count) const {
|
||||
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
|
||||
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
|
||||
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0));
|
||||
}
|
||||
// Use the source packet type with the same size as DstPacketType, if it exists
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
|
||||
Index count) const {
|
||||
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
|
||||
using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
|
||||
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
|
||||
return pcast<SizedSrcPacketType, DstPacketType>(
|
||||
srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0));
|
||||
}
|
||||
// unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
|
||||
Index count) const {
|
||||
constexpr int NumPackets = 2;
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<SrcPacketType, NumPackets> packets =
|
||||
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
|
||||
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
|
||||
}
|
||||
// unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
|
||||
Index count) const {
|
||||
constexpr int NumPackets = 4;
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<SrcPacketType, NumPackets> packets =
|
||||
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
|
||||
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
|
||||
packets.packet[3]);
|
||||
}
|
||||
// unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
|
||||
Index count) const {
|
||||
constexpr int NumPackets = 8;
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<SrcPacketType, NumPackets> packets =
|
||||
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
|
||||
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
|
||||
packets.packet[3], packets.packet[4], packets.packet[5],
|
||||
packets.packet[6], packets.packet[7]);
|
||||
}
|
||||
|
||||
// Analogous routines for linear access.
|
||||
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
|
||||
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
|
||||
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
|
||||
SrcPacketType src;
|
||||
if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) {
|
||||
src = srcPacket<SrcLoadMode>(index, 0);
|
||||
} else {
|
||||
Array<SrcType, SrcPacketSize, 1> srcArray;
|
||||
for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k);
|
||||
for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
|
||||
src = pload<SrcPacketType>(srcArray.data());
|
||||
}
|
||||
return pcast<SrcPacketType, DstPacketType>(src);
|
||||
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0));
|
||||
}
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
|
||||
@ -749,6 +891,55 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
|
||||
srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7));
|
||||
}
|
||||
|
||||
// packetSegment variants
|
||||
template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
|
||||
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
|
||||
return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0));
|
||||
}
|
||||
// Use the source packet type with the same size as DstPacketType, if it exists
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
|
||||
using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
|
||||
constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
|
||||
return pcast<SizedSrcPacketType, DstPacketType>(
|
||||
srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0));
|
||||
}
|
||||
// unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
constexpr int NumPackets = 2;
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<SrcPacketType, NumPackets> packets =
|
||||
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
|
||||
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
|
||||
}
|
||||
// unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
constexpr int NumPackets = 4;
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<SrcPacketType, NumPackets> packets =
|
||||
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
|
||||
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
|
||||
packets.packet[3]);
|
||||
}
|
||||
// unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
|
||||
template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
constexpr int NumPackets = 8;
|
||||
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
|
||||
PacketBlock<SrcPacketType, NumPackets> packets =
|
||||
srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
|
||||
return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
|
||||
packets.packet[3], packets.packet[4], packets.packet[5],
|
||||
packets.packet[6], packets.packet[7]);
|
||||
}
|
||||
|
||||
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; }
|
||||
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; }
|
||||
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; }
|
||||
@ -826,6 +1017,20 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
|
||||
m_d.arg3Impl.template packet<LoadMode, PacketType>(index));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
|
||||
m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
|
||||
m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
|
||||
m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
|
||||
m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count));
|
||||
}
|
||||
|
||||
protected:
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
struct Data {
|
||||
@ -922,6 +1127,18 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
|
||||
m_d.rhsImpl.template packet<LoadMode, PacketType>(index));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
|
||||
m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count),
|
||||
m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
|
||||
}
|
||||
|
||||
protected:
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
struct Data {
|
||||
@ -1013,7 +1230,7 @@ struct mapbase_evaluator : evaluator_base<Derived> {
|
||||
m_innerStride(map.innerStride()),
|
||||
m_outerStride(map.outerStride()) {
|
||||
EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0,
|
||||
internal::inner_stride_at_compile_time<Derived>::ret == 1),
|
||||
inner_stride_at_compile_time<Derived>::ret == 1),
|
||||
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
@ -1035,23 +1252,47 @@ struct mapbase_evaluator : evaluator_base<Derived> {
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
|
||||
PointerType ptr = m_data + row * rowStride() + col * colStride();
|
||||
return internal::ploadt<PacketType, LoadMode>(ptr);
|
||||
return ploadt<PacketType, LoadMode>(ptr);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
|
||||
return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
|
||||
return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
|
||||
PointerType ptr = m_data + row * rowStride() + col * colStride();
|
||||
return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
|
||||
pstoret<Scalar, PacketType, StoreMode>(ptr, x);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
|
||||
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
|
||||
pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
PointerType ptr = m_data + row * rowStride() + col * colStride();
|
||||
return ploadtSegment<PacketType, LoadMode>(ptr, begin, count);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
PointerType ptr = m_data + row * rowStride() + col * colStride();
|
||||
pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x, begin, count);
|
||||
}
|
||||
|
||||
protected:
|
||||
@ -1063,8 +1304,8 @@ struct mapbase_evaluator : evaluator_base<Derived> {
|
||||
}
|
||||
|
||||
PointerType m_data;
|
||||
const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
|
||||
const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
|
||||
const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
|
||||
const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
|
||||
};
|
||||
|
||||
template <typename PlainObjectType, int MapOptions, typename StrideType>
|
||||
@ -1117,7 +1358,7 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType>>
|
||||
// -------------------- Block --------------------
|
||||
|
||||
template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
|
||||
bool HasDirectAccess = internal::has_direct_access<ArgType>::ret>
|
||||
bool HasDirectAccess = has_direct_access<ArgType>::ret>
|
||||
struct block_evaluator;
|
||||
|
||||
template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
|
||||
@ -1246,6 +1487,39 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
x);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col,
|
||||
begin, count);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
|
||||
else
|
||||
return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
|
||||
begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row,
|
||||
m_startCol.value() + col, x, begin, count);
|
||||
}
|
||||
|
||||
template <int StoreMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
|
||||
Index count) {
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
|
||||
count);
|
||||
else
|
||||
return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0, x, begin, count);
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
|
||||
linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
|
||||
@ -1341,8 +1615,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
|
||||
typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
|
||||
typedef typename internal::nested_eval<ArgType, Factor>::type ArgTypeNested;
|
||||
typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
|
||||
typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
|
||||
typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
|
||||
@ -1361,19 +1635,15 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
|
||||
// try to avoid using modulo; this is a pure optimization strategy
|
||||
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
|
||||
: RowFactor == 1 ? row
|
||||
: row % m_rows.value();
|
||||
const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
|
||||
: ColFactor == 1 ? col
|
||||
: col % m_cols.value();
|
||||
const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
|
||||
const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
|
||||
|
||||
return m_argImpl.coeff(actual_row, actual_col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
// try to avoid using modulo; this is a pure optimization strategy
|
||||
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
|
||||
const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
|
||||
? (ColFactor == 1 ? index : index % m_cols.value())
|
||||
: (RowFactor == 1 ? index : index % m_rows.value());
|
||||
|
||||
@ -1382,25 +1652,38 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
|
||||
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
|
||||
: RowFactor == 1 ? row
|
||||
: row % m_rows.value();
|
||||
const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
|
||||
: ColFactor == 1 ? col
|
||||
: col % m_cols.value();
|
||||
const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
|
||||
const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
|
||||
|
||||
return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
|
||||
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
|
||||
const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
|
||||
? (ColFactor == 1 ? index : index % m_cols.value())
|
||||
: (RowFactor == 1 ? index : index % m_rows.value());
|
||||
|
||||
return m_argImpl.template packet<LoadMode, PacketType>(actual_index);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
|
||||
const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
|
||||
const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
|
||||
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count);
|
||||
}
|
||||
|
||||
template <int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
|
||||
const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
|
||||
? (ColFactor == 1 ? index : index % m_cols.value())
|
||||
: (RowFactor == 1 ? index : index % m_rows.value());
|
||||
|
||||
return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count);
|
||||
}
|
||||
|
||||
protected:
|
||||
const ArgTypeNested m_arg;
|
||||
evaluator<ArgTypeNestedCleaned> m_argImpl;
|
||||
@ -1457,6 +1740,28 @@ struct evaluator_wrapper_base : evaluator_base<XprType> {
m_argImpl.template writePacket<StoreMode>(index, x);
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
}

template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count);
}

template <int StoreMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count);
}

protected:
evaluator<ArgType> m_argImpl;
};
@ -1536,41 +1841,97 @@ struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<Arg

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
enum {
PacketSize = unpacket_traits<PacketType>::size,
OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
};
typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(
ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? m_cols.value() - col - OffsetCol : col));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;

Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;

return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol));
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
enum { PacketSize = unpacket_traits<PacketType>::size };
return preverse(
m_argImpl.template packet<LoadMode, PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;

Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;

return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex));
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
// FIXME we could factorize some code with packet(i,j)
enum {
PacketSize = unpacket_traits<PacketType>::size,
OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
};
typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
m_argImpl.template writePacket<LoadMode>(ReverseRow ? m_rows.value() - row - OffsetRow : row,
ReverseCol ? m_cols.value() - col - OffsetCol : col,
reverse_packet::run(x));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;

Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;

m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x));
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
enum { PacketSize = unpacket_traits<PacketType>::size };
m_argImpl.template writePacket<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
static constexpr int PacketSize = unpacket_traits<PacketType>::size;

Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;

m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x));
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;

Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;

return reverse_packet::run(
m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count));
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;

Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
Index actualBegin = PacketSize - count - begin;

return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count));
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
Index count) {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;
static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;

Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;

m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count);
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
Index count) {
static constexpr int PacketSize = unpacket_traits<PacketType>::size;

Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
Index actualBegin = PacketSize - count - begin;

m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count);
}

protected:
@ -1621,7 +1982,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex>> : evaluator_base<Diagonal<ArgType

protected:
evaluator<ArgType> m_argImpl;
const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;

private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const {
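A note on the segment reflection used by the Reverse evaluator above: when the packet itself is reversed, a destination segment [begin, begin + count) corresponds to the source segment [PacketSize - begin - count, PacketSize - begin), which is where actualBegin = PacketSize - count - begin comes from. A small worked example (the numbers are illustrative only, not part of the patch):

// Suppose PacketSize = 8, begin = 1, count = 3, ReversePacket = true.
// actualBegin = 8 - 3 - 1 = 4, so elements are read from source positions [4, 7).
// After preverse(), source element 6 lands at destination position 1, element 5 at 2,
// and element 4 at 3, i.e. exactly the requested destination range [1, 4).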
@ -1562,6 +1562,72 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
return (Packet)pand(result, peven_mask(result)); // atan2 0 atan2 0 ...
}

/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
 * outside this range are not defined. \a *from does not need to be aligned, and can be null if \a count is zero.*/
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
Index count) {
using Scalar = typename unpacket_traits<Packet>::type;
constexpr Index PacketSize = unpacket_traits<Packet>::size;
eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
Scalar aux[PacketSize];
smart_copy(from + begin, from + begin + count, aux + begin);
return ploadu<Packet>(aux);
}

/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
 * outside this range are not defined. \a *from must be aligned, and cannot be null.*/
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
Index count) {
return ploaduSegment<Packet>(from, begin, count);
}

/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
Elements outside of the range [begin, begin + count) are not defined. \a *to does not need to be aligned, and can be
null if \a count is zero.*/
template <typename Scalar, typename Packet>
EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) {
constexpr Index PacketSize = unpacket_traits<Packet>::size;
eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
Scalar aux[PacketSize];
pstoreu<Scalar, Packet>(aux, from);
smart_copy(aux + begin, aux + begin + count, to + begin);
}

/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
Elements outside of the range [begin, begin + count) are not defined. \a *to must be aligned, and cannot be
null.*/
template <typename Scalar, typename Packet>
EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) {
return pstoreuSegment(to, from, begin, count);
}

/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
 * outside this range are not defined.*/
template <typename Packet, int Alignment>
EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
Index count) {
constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
if (Alignment >= RequiredAlignment) {
return ploadSegment<Packet>(from, begin, count);
} else {
return ploaduSegment<Packet>(from, begin, count);
}
}

/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
Elements outside of the range [begin, begin + count) are not defined.*/
template <typename Scalar, typename Packet, int Alignment>
EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) {
constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
if (Alignment >= RequiredAlignment) {
pstoreSegment<Scalar, Packet>(to, from, begin, count);
} else {
pstoreuSegment<Scalar, Packet>(to, from, begin, count);
}
}
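As a quick illustration of how these primitives are meant to be used, here is a minimal sketch of a tail-handling loop, mirroring what the new packet_segment test below exercises. The function name copy_tail and the buffers are made up for the example; only ploaduSegment/pstoreuSegment and the standard packet helpers come from Eigen:

#include <Eigen/Core>
using namespace Eigen;

// Copy n floats, vectorizing the body and using a packet segment for the
// remainder so that no element past index n - 1 is read or written.
void copy_tail(const float* in, float* out, Index n) {
  using Packet = internal::packet_traits<float>::type;
  constexpr Index PacketSize = internal::unpacket_traits<Packet>::size;
  Index i = 0;
  for (; i + PacketSize <= n; i += PacketSize)
    internal::pstoreu(out + i, internal::ploadu<Packet>(in + i));
  const Index count = n - i;  // 0 <= count < PacketSize
  if (count > 0) {
    Packet p = internal::ploaduSegment<Packet>(in + i, 0, count);
    internal::pstoreuSegment<float, Packet>(out + i, p, 0, count);
  }
}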
#ifndef EIGEN_NO_IO

template <typename Packet>
@ -283,7 +283,7 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, cons
template <typename Lhs, typename Rhs>
struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
template <typename T>
struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {};
struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
typedef typename Product<Lhs, Rhs>::Scalar Scalar;

// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
@ -445,7 +445,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductM

eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
std::conditional_t<HasScalarFactor, true_type, false_type>());
bool_constant<HasScalarFactor>());
}

protected:
@ -635,6 +635,24 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
return packet<LoadMode, PacketType>(row, col);
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
Index count) const {
PacketType res;
typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
PacketImpl;
PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
return res;
}

template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
return packetSegment<LoadMode, PacketType>(row, col, begin, count);
}

protected:
add_const_on_value_type_t<LhsNested> m_lhs;
add_const_on_value_type_t<RhsNested> m_rhs;
@ -670,6 +688,13 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
row, col, lhs, rhs, innerDim, res, begin, count);
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
}
};

template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -681,6 +706,13 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
row, col, lhs, rhs, innerDim, res, begin, count);
res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
}
};

template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -689,6 +721,12 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
Index /*innerDim*/, Packet& res) {
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index /*innerDim*/, Packet& res, Index begin,
Index count) {
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),
rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
}
};

template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -697,6 +735,12 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> {
Index /*innerDim*/, Packet& res) {
res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index /*innerDim*/, Packet& res, Index begin,
Index count) {
res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
pset1<Packet>(rhs.coeff(Index(0), col)));
}
};

template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -705,6 +749,11 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> {
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
Index /*begin*/, Index /*count*/) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
};

template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -713,6 +762,11 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> {
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
Index /*begin*/, Index /*count*/) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
};

template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -723,6 +777,13 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
for (Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for (Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
res);
}
};

template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@ -733,6 +794,13 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
for (Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
Index innerDim, Packet& res, Index begin, Index count) {
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for (Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
res);
}
};

/***************************************************************************
@ -871,6 +939,26 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
internal::true_type) const {
return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
internal::pset1<PacketType>(m_diagImpl.coeff(id)));
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
internal::false_type) const {
enum {
InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
DiagonalPacketLoadMode = plain_enum_min(
LoadMode,
((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
};
return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
}

evaluator<DiagonalType> m_diagImpl;
evaluator<MatrixType> m_matImpl;
};
@ -892,7 +980,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
typedef typename XprType::PlainObject PlainObject;
typedef typename Lhs::DiagonalVectorType DiagonalType;

enum { StorageOrder = Base::StorageOrder_ };
static constexpr int StorageOrder = Base::StorageOrder_;
using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;

EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}

@ -905,8 +994,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
// FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
// See also similar calls below.
return this->template packet_impl<LoadMode, PacketType>(
row, col, row, std::conditional_t<int(StorageOrder) == RowMajor, internal::true_type, internal::false_type>());
return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
}

template <int LoadMode, typename PacketType>
@ -914,6 +1002,19 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
int(StorageOrder) == ColMajor ? 0 : idx);
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
// FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
// See also similar calls below.
return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
begin, count);
}
#endif
};

@ -933,7 +1034,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
typedef Product<Lhs, Rhs, ProductKind> XprType;
typedef typename XprType::PlainObject PlainObject;

enum { StorageOrder = Base::StorageOrder_ };
static constexpr int StorageOrder = Base::StorageOrder_;
using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;

EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}

@ -944,14 +1046,23 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
#ifndef EIGEN_GPUCC
template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
return this->template packet_impl<LoadMode, PacketType>(
row, col, col, std::conditional_t<int(StorageOrder) == ColMajor, internal::true_type, internal::false_type>());
return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
int(StorageOrder) == ColMajor ? 0 : idx);
return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
}

template <int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
begin, count);
}
#endif
};
@ -65,6 +65,31 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode, LoadMode, PacketType>(row, col);
}

template <int StoreMode, int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count);
m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count);
}

template <int StoreMode, int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count);
const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count);
m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
}

// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
// mean no CRTP (Gael)
template <int StoreMode, int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count);
}
};

} // namespace internal
@ -36,6 +36,10 @@ template <typename MatrixType, typename MemberOp, int Direction>
class PartialReduxExpr;

namespace internal {

template <typename ArgType, typename MemberOp, int Direction>
struct enable_packet_segment<PartialReduxExpr<ArgType, MemberOp, Direction>> : std::false_type {};

template <typename MatrixType, typename MemberOp, int Direction>
struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> {
typedef typename MemberOp::result_type Scalar;
@ -507,6 +507,57 @@ EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, con
return pnegate(pmadd(a, b, c));
}
#endif

/*---------------- load/store segment support ----------------*/

/*---------------- std::complex<float> ----------------*/

template <>
struct has_packet_segment<Packet2cf> : std::true_type {};

template <>
struct has_packet_segment<Packet4cf> : std::true_type {};

template <>
inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) {
return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
}

template <>
inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin,
Index count) {
_mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
}

template <>
inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) {
return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
}

template <>
inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin,
Index count) {
_mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
}

/*---------------- std::complex<double> ----------------*/

template <>
struct has_packet_segment<Packet2cd> : std::true_type {};

template <>
inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) {
return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
}

template <>
inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
Index begin, Index count) {
_mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
}

/*---------------- end load/store segment support ----------------*/

} // end namespace internal

} // end namespace Eigen
@ -2938,6 +2938,258 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
}

/*---------------- load/store segment support ----------------*/

// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_4x8(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 4);
long long mask = 1;
mask <<= CHAR_BIT * count;
mask--;
mask <<= CHAR_BIT * begin;
#if defined(_WIN32) && !defined(_WIN64)
return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
#else
return _mm_cvtsi64_si128(mask);
#endif
}

// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_8x8(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 8);
long long mask = 1;
// avoid UB when count == 8
mask <<= (CHAR_BIT / 2) * count;
mask <<= (CHAR_BIT / 2) * count;
mask--;
mask <<= CHAR_BIT * begin;
#if defined(_WIN32) && !defined(_WIN64)
return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
#else
return _mm_cvtsi64_si128(mask);
#endif
}

// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_4x32(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 4);
return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));
}

// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m128i segment_mask_2x64(Index begin, Index count) {
eigen_assert(begin >= 0 && begin + count <= 2);
return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));
}

// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m256i segment_mask_8x32(Index begin, Index count) {
__m128i mask_epi8 = segment_mask_8x8(begin, count);
#ifdef EIGEN_VECTORIZE_AVX2
__m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);
#else
__m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);
__m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));
__m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
#endif
return mask_epi32;
}

// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
inline __m256i segment_mask_4x64(Index begin, Index count) {
__m128i mask_epi8 = segment_mask_4x8(begin, count);
#ifdef EIGEN_VECTORIZE_AVX2
__m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);
#else
__m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);
__m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));
__m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
#endif
return mask_epi64;
}
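For reference, the byte-level mask construction above amounts to ((1 << CHAR_BIT * count) - 1) << CHAR_BIT * begin; a small worked example (the chosen arguments are illustrative only):

// segment_mask_4x8(begin = 1, count = 2):
//   mask = 1;
//   mask <<= 8 * 2;  // 0x0000000000010000
//   mask--;          // 0x000000000000FFFF
//   mask <<= 8 * 1;  // 0x0000000000FFFF00  -> bytes 1 and 2 are 0xFF, the rest are 0
// segment_mask_4x32 then sign-extends each byte to a 32-bit lane, producing the
// per-lane mask {0, -1, -1, 0}, which is what _mm_maskload_ps / _mm_maskstore_ps expect.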
/*---------------- float ----------------*/

template <>
struct has_packet_segment<Packet4f> : std::true_type {};

template <>
struct has_packet_segment<Packet8f> : std::true_type {};

template <>
inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
}

template <>
inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
_mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
}

template <>
inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
}

template <>
inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
_mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
}

/*---------------- int32 ----------------*/

template <>
struct has_packet_segment<Packet4i> : std::true_type {};

template <>
struct has_packet_segment<Packet8i> : std::true_type {};

#ifdef EIGEN_VECTORIZE_AVX2

template <>
inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
}

template <>
inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
_mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
}

template <>
inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
}

template <>
inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
_mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
}

#else

template <>
inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
}

template <>
inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
}

template <>
inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
}

template <>
inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
}

#endif

/*---------------- uint32 ----------------*/

template <>
struct has_packet_segment<Packet4ui> : std::true_type {};

template <>
struct has_packet_segment<Packet8ui> : std::true_type {};

template <>
inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
}

template <>
inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
}

template <>
inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
}

template <>
inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
}

/*---------------- double ----------------*/

template <>
struct has_packet_segment<Packet2d> : std::true_type {};

template <>
struct has_packet_segment<Packet4d> : std::true_type {};

template <>
inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
}

template <>
inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
_mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
}

template <>
inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
}

template <>
inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
_mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
}

#ifdef EIGEN_VECTORIZE_AVX2

/*---------------- int64_t ----------------*/

template <>
struct has_packet_segment<Packet2l> : std::true_type {};

template <>
struct has_packet_segment<Packet4l> : std::true_type {};

template <>
inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
}
template <>
inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
_mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
}
template <>
inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
}
template <>
inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
_mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
}

/*---------------- uint64_t ----------------*/

template <>
struct has_packet_segment<Packet4ul> : std::true_type {};

template <>
inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
}
template <>
inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
}
#endif

/*---------------- end load/store segment support ----------------*/

} // end namespace internal

} // end namespace Eigen
@ -29,6 +29,11 @@ struct assign_op {
EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
pstoret<DstScalar, Packet, Alignment>(a, b);
}

template <int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count);
}
};

// Empty overload for void type (used by PermutationMatrix)
@ -60,6 +65,12 @@ struct compound_assign_op {
assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
}

template <int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>(
a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count);
}
};

template <typename DstScalar, typename SrcScalar, typename Func>
@ -438,7 +438,6 @@ struct scalar_quotient_op : binary_op_base<LhsScalar, RhsScalar> {
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
maybe_raise_div_by_zero<Packet>::run(b);
return internal::pdiv(a, b);
}
};
@ -28,7 +28,7 @@ struct scalar_constant_op {
const Scalar m_other;
};
template <typename Scalar>
struct functor_traits<scalar_constant_op<Scalar> > {
struct functor_traits<scalar_constant_op<Scalar>> {
enum {
Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
PacketAccess = packet_traits<Scalar>::Vectorizable,
@ -56,7 +56,7 @@ struct scalar_identity_op {
}
};
template <typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> > {
struct functor_traits<scalar_identity_op<Scalar>> {
enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true };
};

@ -86,18 +86,19 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
// Principle:
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
Packet low = pset1<Packet>(m_low);
Packet high = pset1<Packet>(m_high);
Packet step = pset1<Packet>(m_step);
if (m_flip) {
Packet pi = plset<Packet>(Scalar(i - m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if (EIGEN_PREDICT_TRUE(i != 0)) return res;
Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
return pselect<Packet>(mask, res, pset1<Packet>(m_low));
Packet res = pmadd(step, pi, high);
Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i)));
return pselect<Packet>(mask, res, low);
} else {
Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits<Packet>::size + 1)) return res;
Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size - 1));
return pselect<Packet>(mask, res, pset1<Packet>(m_high));
Packet res = pmadd(step, pi, low);
Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1)));
return pselect<Packet>(mask, res, high);
}
}

@ -139,7 +140,7 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ true> {
template <typename Scalar>
struct linspaced_op;
template <typename Scalar>
struct functor_traits<linspaced_op<Scalar> > {
struct functor_traits<linspaced_op<Scalar>> {
enum {
Cost = 1,
PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
@ -192,7 +193,7 @@ struct equalspaced_op {
};

template <typename Scalar>
struct functor_traits<equalspaced_op<Scalar> > {
struct functor_traits<equalspaced_op<Scalar>> {
enum {
Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
PacketAccess =
@ -514,6 +514,12 @@ template <typename Xpr>
struct eigen_memset_helper;
template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
struct eigen_zero_impl;

template <typename Packet>
struct has_packet_segment : std::false_type {};

template <typename Xpr>
struct enable_packet_segment : std::true_type {};
} // namespace internal

} // end namespace Eigen
@ -90,12 +90,8 @@ namespace internal {
 * we however don't want to add a dependency to Boost.
 */

struct true_type {
enum { value = 1 };
};
struct false_type {
enum { value = 0 };
};
using std::false_type;
using std::true_type;

template <bool Condition>
struct bool_constant;
@ -996,6 +996,27 @@ struct is_matrix_base_xpr : std::is_base_of<MatrixBase<remove_all_t<XprType>>, r
template <typename XprType>
struct is_permutation_base_xpr : std::is_base_of<PermutationBase<remove_all_t<XprType>>, remove_all_t<XprType>> {};

/*---------------- load/store segment support ----------------*/

// recursively traverse unary, binary, and ternary expressions to determine if packet segments are supported

template <typename Func, typename Xpr>
struct enable_packet_segment<CwiseNullaryOp<Func, Xpr>> : enable_packet_segment<remove_all_t<Xpr>> {};

template <typename Func, typename Xpr>
struct enable_packet_segment<CwiseUnaryOp<Func, Xpr>> : enable_packet_segment<remove_all_t<Xpr>> {};

template <typename Func, typename LhsXpr, typename RhsXpr>
struct enable_packet_segment<CwiseBinaryOp<Func, LhsXpr, RhsXpr>>
    : bool_constant<enable_packet_segment<remove_all_t<LhsXpr>>::value &&
                    enable_packet_segment<remove_all_t<RhsXpr>>::value> {};

template <typename Func, typename LhsXpr, typename MidXpr, typename RhsXpr>
struct enable_packet_segment<CwiseTernaryOp<Func, LhsXpr, MidXpr, RhsXpr>>
    : bool_constant<enable_packet_segment<remove_all_t<LhsXpr>>::value &&
                    enable_packet_segment<remove_all_t<MidXpr>>::value &&
                    enable_packet_segment<remove_all_t<RhsXpr>>::value> {};

} // end namespace internal
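A sketch of how an expression type can opt out of packet segments under this recursive scheme, modeled on the PartialReduxExpr specialization earlier in this patch. MyCustomXpr is a made-up placeholder, not an Eigen type:

namespace Eigen {
namespace internal {

// Hypothetical expression that cannot safely honor [begin, begin + count)
// sub-packet accesses: disabling packet segments for it makes the assignment
// loops fall back to scalar handling of the unaligned head and tail.
template <typename ArgType>
struct enable_packet_segment<MyCustomXpr<ArgType>> : std::false_type {};

}  // namespace internal
}  // namespace Eigen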
/** \class ScalarBinaryOpTraits
@ -308,19 +308,24 @@ struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVec
this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
}
};
static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
using head_loop =
    unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
const Index size = kernel.size();
const Index alignedStart =
    DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);

unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);
head_loop::run(kernel, 0, alignedStart);

constexpr float cost = static_cast<float>(XprEvaluationCost);
AssignmentFunctor functor(kernel);
device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);

unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
tail_loop::run(kernel, alignedEnd, size);
}
};
@ -186,6 +186,7 @@ ei_add_test(mixingtypes)
ei_add_test(float_conversion)
ei_add_test(io)
ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
ei_add_test(packet_segment)
ei_add_test(vectorization_logic)
ei_add_test(basicstuff)
ei_add_test(constexpr)
test/packet_segment.cpp (new file, 168 lines)
@ -0,0 +1,168 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 The Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include "main.h"

template <typename Scalar, typename Packet>
void verify_data(const Scalar* data_in, const Scalar* data_out, const Packet& a, Index begin, Index count) {
constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
bool ok = true;
for (Index i = begin; i < begin + count; i++) {
ok = ok && numext::equal_strict(data_in[i], data_out[i]);
}
if (!ok) {
std::cout << "begin: " << begin << ", count: " << count << "\n";
std::cout << "Scalar type: " << type_name(Scalar()) << " x " << PacketSize << "\n";
std::cout << "data in: {";
for (Index i = 0; i < PacketSize; i++) {
if (i > 0) std::cout << ",";
if (i < begin || i >= begin + count) {
std::cout << "MASK";
} else {
std::cout << data_in[i];
}
}
std::cout << "}\n";
std::cout << "data out: {";
for (Index i = 0; i < PacketSize; i++) {
if (i > 0) std::cout << ",";
if (i < begin || i >= begin + count) {
std::cout << "MASK";
} else {
std::cout << data_out[i];
}
}
std::cout << "}\n";
std::cout << "packet: ";
std::cout << internal::postream(a) << "\n";
}
VERIFY(ok);
}

template <typename Scalar, int PacketSize, bool Run = internal::find_packet_by_size<Scalar, PacketSize>::value>
struct packet_segment_test_impl {
using Packet = typename internal::find_packet_by_size<Scalar, PacketSize>::type;
static void test_unaligned() {
// test loading a packet segment from unaligned memory that includes unallocated memory

// | X X X X | * * * X | X X X X |
//   begin -> { X | * * * } <- begin + count

VectorX<Scalar> data_in(PacketSize), data_out(PacketSize);
data_in.setRandom();
data_out.setRandom();

Scalar* unaligned_data_in = data_in.data() - 1;
Scalar* unaligned_data_out = data_out.data() - 1;

Index begin = 1;
Index count = PacketSize - 1;

Packet a = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, a, begin, count);

verify_data(unaligned_data_in, unaligned_data_out, a, begin, count);

// test loading the entire packet

data_in.setRandom();
data_out.setRandom();

unaligned_data_in = data_in.data();
unaligned_data_out = data_out.data();

begin = 0;
count = PacketSize;

Packet b = internal::ploaduSegment<Packet>(unaligned_data_in, begin, count);
internal::pstoreuSegment<Scalar, Packet>(unaligned_data_out, b, begin, count);

verify_data(unaligned_data_in, unaligned_data_out, b, begin, count);

// test loading an empty packet segment in unallocated memory
count = 0;

for (begin = 0; begin < PacketSize; begin++) {
data_in.setRandom();
data_out = data_in;
Packet c = internal::ploaduSegment<Packet>(data_in.data(), begin, count);
internal::pstoreuSegment<Scalar, Packet>(data_out.data(), c, begin, count);
// verify that ploaduSegment / pstoreuSegment did nothing
VERIFY_IS_CWISE_EQUAL(data_in, data_out);
}
}
static void test_aligned() {
// test loading a packet segment from aligned memory that includes unallocated memory

// | X X X X | * * * X | X X X X |
//   begin -> { * * * X } <- begin + count

VectorX<Scalar> data_in(PacketSize - 1), data_out(PacketSize - 1);
data_in.setRandom();
data_out.setRandom();

Scalar* aligned_data_in = data_in.data();
Scalar* aligned_data_out = data_out.data();

Index begin = 0;
Index count = PacketSize - 1;

Packet b = internal::ploadSegment<Packet>(aligned_data_in, begin, count);
internal::pstoreSegment<Scalar, Packet>(aligned_data_out, b, begin, count);

verify_data(aligned_data_in, aligned_data_out, b, begin, count);
}
static void run() {
test_unaligned();
test_aligned();
}
};

template <typename Scalar, int PacketSize>
struct packet_segment_test_impl<Scalar, PacketSize, false> {
static void run() {}
};

template <typename Scalar, int PacketSize>
struct packet_segment_test_driver {
static void run() {
packet_segment_test_impl<Scalar, PacketSize>::run();
packet_segment_test_driver<Scalar, PacketSize / 2>::run();
}
};

template <typename Scalar>
struct packet_segment_test_driver<Scalar, 1> {
static void run() {}
};

template <typename Scalar>
void test_packet_segment() {
packet_segment_test_driver<Scalar, internal::packet_traits<Scalar>::size>::run();
}

EIGEN_DECLARE_TEST(packet_segment) {
for (int i = 0; i < g_repeat; i++) {
test_packet_segment<bool>();
test_packet_segment<int8_t>();
test_packet_segment<uint8_t>();
test_packet_segment<int16_t>();
test_packet_segment<uint16_t>();
test_packet_segment<int32_t>();
test_packet_segment<uint32_t>();
test_packet_segment<int64_t>();
test_packet_segment<uint64_t>();
test_packet_segment<bfloat16>();
test_packet_segment<half>();
test_packet_segment<float>();
test_packet_segment<double>();
test_packet_segment<std::complex<float>>();
test_packet_segment<std::complex<double>>();
}
}