diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 0ea1bc3c0..a33a21a2b 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -136,6 +136,8 @@ struct copy_using_evaluator_traits { : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling) #endif : NoUnrolling; + static constexpr bool UsePacketSegment = + enable_packet_segment::value && enable_packet_segment::value && has_packet_segment::value; #ifdef EIGEN_DEBUG_ASSIGN static void debug() { @@ -273,6 +275,33 @@ struct copy_using_evaluator_innervec_InnerUnrolling +struct copy_using_evaluator_innervec_segment { + using PacketType = typename Kernel::PacketType; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) { + kernel.template assignPacketSegmentByOuterInner(outer, Start, 0, + Stop - Start); + } +}; + +template +struct copy_using_evaluator_innervec_segment + : copy_using_evaluator_DefaultTraversal_InnerUnrolling {}; + +template +struct copy_using_evaluator_innervec_segment { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {} +}; + +template +struct copy_using_evaluator_innervec_segment { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {} +}; + /*************************************************************************** * Part 3 : implementation of all cases ***************************************************************************/ @@ -353,28 +382,48 @@ struct dense_assignment_loop_impl { // The goal of unaligned_dense_assignment_loop is simply to factorize the handling // of the non vectorizable beginning and ending parts -template +template struct unaligned_dense_assignment_loop { - // if IsAligned = true, then do nothing + // if Skip == true, then do nothing template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& /*kernel*/, Index /*start*/, + Index /*end*/) {} + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& /*kernel*/, Index /*outer*/, + Index /*innerStart*/, Index /*innerEnd*/) {} }; -template <> -struct unaligned_dense_assignment_loop { - // MSVC must not inline this functions. If it does, it fails to optimize the - // packet access path. - // FIXME check which version exhibits this issue -#if EIGEN_COMP_MSVC +template +struct unaligned_dense_assignment_loop { template - static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end) -#else + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) { + Index count = end - start; + eigen_assert(count <= unpacket_traits::size); + if (count > 0) kernel.template assignPacketSegment(start, 0, count); + } template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) -#endif - { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index outer, Index start, + Index end) { + Index count = end - start; + eigen_assert(count <= unpacket_traits::size); + if (count > 0) + kernel.template assignPacketSegmentByOuterInner(outer, start, 0, count); + } +}; + +template +struct unaligned_dense_assignment_loop { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) { for (Index index = start; index < end; ++index) kernel.assignCoeff(index); } + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index outer, Index innerStart, + Index innerEnd) { + for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner); + } }; template @@ -395,28 +444,60 @@ struct copy_using_evaluator_linearvec_CompleteUnrolling { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} }; +template +struct copy_using_evaluator_linearvec_segment { + using PacketType = typename Kernel::PacketType; + static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment; + static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { + kernel.template assignPacketSegment(Index_, 0, Stop - Index_); + } +}; + +template +struct copy_using_evaluator_linearvec_segment + : copy_using_evaluator_LinearTraversal_CompleteUnrolling {}; + +template +struct copy_using_evaluator_linearvec_segment { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} +}; + +template +struct copy_using_evaluator_linearvec_segment { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} +}; + template struct dense_assignment_loop_impl { using Scalar = typename Kernel::Scalar; using PacketType = typename Kernel::PacketType; static constexpr int PacketSize = unpacket_traits::size; - static constexpr int RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment; - static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment; static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; - static constexpr int DstAlignment = - packet_traits::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment; + static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar)); + static constexpr int RequestedAlignment = unpacket_traits::alignment; + static constexpr bool Alignable = + (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0); + static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment; + static constexpr bool DstIsAligned = DstAlignment >= Alignment; + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; + + using head_loop = + unaligned_dense_assignment_loop; + using tail_loop = unaligned_dense_assignment_loop; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { const Index size = kernel.size(); - const Index alignedStart = DstIsAligned ? 0 : first_aligned(kernel.dstDataPtr(), size); + const Index alignedStart = DstIsAligned ? 0 : first_aligned(kernel.dstDataPtr(), size); const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize); - unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); + head_loop::run(kernel, 0, alignedStart); for (Index index = alignedStart; index < alignedEnd; index += PacketSize) - kernel.template assignPacket(index); + kernel.template assignPacket(index); - unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); + tail_loop::run(kernel, alignedEnd, size); } }; @@ -426,10 +507,11 @@ struct dense_assignment_loop_impl::size; static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime; static constexpr int AlignedSize = numext::round_down(Size, PacketSize); + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { copy_using_evaluator_linearvec_CompleteUnrolling::run(kernel); - copy_using_evaluator_LinearTraversal_CompleteUnrolling::run(kernel); + copy_using_evaluator_linearvec_segment::run(kernel); } }; @@ -505,35 +587,35 @@ struct dense_assignment_loop_impl using Scalar = typename Kernel::Scalar; using PacketType = typename Kernel::PacketType; static constexpr int PacketSize = unpacket_traits::size; - static constexpr int RequestedAlignment = Kernel::AssignmentTraits::InnerRequiredAlignment; + static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; + static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar)); + static constexpr int RequestedAlignment = unpacket_traits::alignment; static constexpr bool Alignable = - packet_traits::AlignedOnScalar || Kernel::AssignmentTraits::DstAlignment >= sizeof(Scalar); - static constexpr bool DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment; - static constexpr int DstAlignment = Alignable ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment; + (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0); + static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment; + static constexpr bool DstIsAligned = DstAlignment >= Alignment; + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; + + using head_loop = unaligned_dense_assignment_loop; + using tail_loop = unaligned_dense_assignment_loop; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { const Scalar* dst_ptr = kernel.dstDataPtr(); - if ((!DstIsAligned) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) { - // the pointer is not aligned-on scalar, so alignment is not possible - return dense_assignment_loop::run(kernel); - } const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0; - Index alignedStart = - ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned(dst_ptr, innerSize); + Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned(dst_ptr, innerSize); for (Index outer = 0; outer < outerSize; ++outer) { const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize); - // do the non-vectorizable part of the assignment - for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner); + + head_loop::run(kernel, outer, 0, alignedStart); // do the vectorizable part of the assignment for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize) - kernel.template assignPacketByOuterInner(outer, inner); + kernel.template assignPacketByOuterInner(outer, inner); - // do the non-vectorizable part of the assignment - for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner); + tail_loop::run(kernel, outer, alignedEnd, innerSize); alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize); } @@ -547,11 +629,16 @@ struct dense_assignment_loop_impl::size; static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime; static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize); + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; + + using packet_loop = copy_using_evaluator_innervec_InnerUnrolling; + using packet_segment_loop = copy_using_evaluator_innervec_segment; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { for (Index outer = 0; outer < kernel.outerSize(); ++outer) { - copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); - copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + packet_loop::run(kernel, outer); + packet_segment_loop::run(kernel, outer); } } }; @@ -635,6 +722,27 @@ class generic_dense_assignment_kernel { assignPacket(row, col); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) { + m_functor.template assignPacketSegment( + &m_dst.coeffRef(row, col), m_src.template packetSegment(row, col, begin, count), begin, + count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) { + m_functor.template assignPacketSegment( + &m_dst.coeffRef(index), m_src.template packetSegment(index, begin, count), begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, + Index count) { + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); + assignPacketSegment(row, col, begin, count); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::RowsAtCompileTime) == 1 ? 0 diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 156ca2b60..87eb65bda 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -198,19 +198,13 @@ struct evaluator> : evaluator_base { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const { - if (IsRowMajor) - return m_d.data[row * m_d.outerStride() + col]; - else - return m_d.data[row + col * m_d.outerStride()]; + return coeff(getIndex(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) { - if (IsRowMajor) - return const_cast(m_d.data)[row * m_d.outerStride() + col]; - else - return const_cast(m_d.data)[row + col * m_d.outerStride()]; + return coeffRef(getIndex(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { @@ -219,10 +213,7 @@ struct evaluator> : evaluator_base { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - if (IsRowMajor) - return ploadt(m_d.data + row * m_d.outerStride() + col); - else - return ploadt(m_d.data + row + col * m_d.outerStride()); + return packet(getIndex(row, col)); } template @@ -232,19 +223,43 @@ struct evaluator> : evaluator_base { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { - if (IsRowMajor) - return pstoret(const_cast(m_d.data) + row * m_d.outerStride() + col, x); - else - return pstoret(const_cast(m_d.data) + row + col * m_d.outerStride(), x); + writePacket(getIndex(row, col), x); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return pstoret(const_cast(m_d.data) + index, x); + pstoret(const_cast(m_d.data) + index, x); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return packetSegment(getIndex(row, col), begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return ploadtSegment(m_d.data + index, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + writePacketSegment(getIndex(row, col), x, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + pstoretSegment(const_cast(m_d.data) + index, x, begin, count); } protected: plainobjectbase_evaluator_data m_d; + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndex(Index row, Index col) const { + return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride(); + } }; template @@ -318,6 +333,28 @@ struct unary_evaluator, IndexBased> : evaluator_base(index, x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_argImpl.template packetSegment(col, row, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_argImpl.template packetSegment(index, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment(col, row, x, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment(index, x, begin, count); + } + protected: evaluator m_argImpl; }; @@ -464,10 +501,10 @@ template struct evaluator> : evaluator_base> { typedef CwiseNullaryOp XprType; - typedef internal::remove_all_t PlainObjectTypeCleaned; + typedef remove_all_t PlainObjectTypeCleaned; enum { - CoeffReadCost = internal::functor_traits::Cost, + CoeffReadCost = functor_traits::Cost, Flags = (evaluator::Flags & (HereditaryBits | (functor_has_linear_access::ret ? LinearAccessBit : 0) | @@ -502,9 +539,21 @@ struct evaluator> return m_wrapper.template packetOp(m_functor, index); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/, + Index /*count*/) const { + return packet(row, col); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/, + Index /*count*/) const { + return packet(index); + } + protected: const NullaryOp m_functor; - const internal::nullary_wrapper m_wrapper; + const nullary_wrapper m_wrapper; }; // -------------------- CwiseUnaryOp -------------------- @@ -546,6 +595,16 @@ struct unary_evaluator, IndexBased> : evaluator_b return m_d.func().packetOp(m_d.argImpl.template packet(index)); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_d.func().packetOp(m_d.argImpl.template packetSegment(row, col, begin, count)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_d.func().packetOp(m_d.argImpl.template packetSegment(index, begin, count)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -600,16 +659,11 @@ struct unary_evaluator, ArgType>, In template using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits::size) == (8 * SrcPacketSize), bool>; - template = true> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) const { - return col + packetSize <= cols(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const { + return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows()); } - template = true> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const { - return row + packetSize <= rows(); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const { - return index + packetSize <= size(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const { + return index + count + begin <= size(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const { @@ -632,43 +686,86 @@ struct unary_evaluator, ArgType>, In template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const { constexpr int PacketSize = unpacket_traits::size; - Index actualRow = IsRowMajor ? row : row + (offset * PacketSize); - Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col; - eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds"); + Index packetOffset = offset * PacketSize; + Index actualRow = IsRowMajor ? row : row + packetOffset; + Index actualCol = IsRowMajor ? col + packetOffset : col; + eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds"); return m_argImpl.template packet(actualRow, actualCol); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const { constexpr int PacketSize = unpacket_traits::size; - Index actualIndex = index + (offset * PacketSize); - eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds"); + Index packetOffset = offset * PacketSize; + Index actualIndex = index + packetOffset; + eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds"); return m_argImpl.template packet(actualIndex); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count, + Index offset) const { + constexpr int PacketSize = unpacket_traits::size; + Index packetOffset = offset * PacketSize; + Index actualRow = IsRowMajor ? row : row + packetOffset; + Index actualCol = IsRowMajor ? col + packetOffset : col; + eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds"); + return m_argImpl.template packetSegment(actualRow, actualCol, begin, count); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count, + Index offset) const { + constexpr int PacketSize = unpacket_traits::size; + Index packetOffset = offset * PacketSize; + Index actualIndex = index + packetOffset + begin; + eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds"); + return m_argImpl.template packetSegment(actualIndex, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock srcPacketSegmentHelper(Index row, Index col, + Index begin, + Index count) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets; + Index offset = begin / SrcPacketSize; + Index actualBegin = begin % SrcPacketSize; + for (; offset < NumPackets; offset++) { + Index actualCount = numext::mini(SrcPacketSize - actualBegin, count); + packets.packet[offset] = srcPacketSegment(row, col, actualBegin, actualCount, offset); + if (count == actualCount) break; + actualBegin = 0; + count -= actualCount; + } + return packets; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock srcPacketSegmentHelper(Index index, + Index begin, + Index count) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets; + Index offset = begin / SrcPacketSize; + Index actualBegin = begin % SrcPacketSize; + for (; offset < NumPackets; offset++) { + Index actualCount = numext::mini(SrcPacketSize - actualBegin, count); + packets.packet[offset] = srcPacketSegment(index, actualBegin, actualCount, offset); + if (count == actualCount) break; + actualBegin = 0; + count -= actualCount; + } + return packets; + } // There is no source packet type with equal or fewer elements than DstPacketType. // This is problematic as the evaluation loop may attempt to access data outside the bounds of the array. // For example, consider the cast utilizing pcast with an array of size 4: {0.0f,1.0f,2.0f,3.0f}. // The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which // is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array. - - // Instead, perform runtime check to determine if the load would access data outside the bounds of the array. - // If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load. - // In either case, perform a vectorized cast of the source packet. template = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const { constexpr int DstPacketSize = unpacket_traits::size; constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); - SrcPacketType src; - if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) { - src = srcPacket(row, col, 0); - } else { - Array srcArray; - for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k); - for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0); - src = pload(srcArray.data()); - } - return pcast(src); + return pcast(srcPacketSegment(row, col, 0, DstPacketSize, 0)); } // Use the source packet type with the same size as DstPacketType, if it exists template = true> @@ -704,22 +801,67 @@ struct unary_evaluator, ArgType>, In srcPacket(row, col, 6), srcPacket(row, col, 7)); } + // packetSegment variants + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int DstPacketSize = unpacket_traits::size; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast(srcPacketSegment(row, col, begin, count, 0)); + } + // Use the source packet type with the same size as DstPacketType, if it exists + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int DstPacketSize = unpacket_traits::size; + using SizedSrcPacketType = typename find_packet_by_size::type; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast( + srcPacketSegment(row, col, begin, count, 0)); + } + // unpacket_traits::size == 2 * SrcPacketSize + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int NumPackets = 2; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets = + srcPacketSegmentHelper(row, col, begin, count); + return pcast(packets.packet[0], packets.packet[1]); + } + // unpacket_traits::size == 4 * SrcPacketSize + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int NumPackets = 4; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets = + srcPacketSegmentHelper(row, col, begin, count); + return pcast(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3]); + } + // unpacket_traits::size == 8 * SrcPacketSize + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + constexpr int NumPackets = 8; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets = + srcPacketSegmentHelper(row, col, begin, count); + return pcast(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3], packets.packet[4], packets.packet[5], + packets.packet[6], packets.packet[7]); + } + // Analogous routines for linear access. template = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const { constexpr int DstPacketSize = unpacket_traits::size; constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); - SrcPacketType src; - if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) { - src = srcPacket(index, 0); - } else { - Array srcArray; - for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k); - for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0); - src = pload(srcArray.data()); - } - return pcast(src); + return pcast(srcPacketSegment(index, 0, DstPacketSize, 0)); } template = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const { @@ -749,6 +891,55 @@ struct unary_evaluator, ArgType>, In srcPacket(index, 6), srcPacket(index, 7)); } + // packetSegment variants + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int DstPacketSize = unpacket_traits::size; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast(srcPacketSegment(index, begin, count, 0)); + } + // Use the source packet type with the same size as DstPacketType, if it exists + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int DstPacketSize = unpacket_traits::size; + using SizedSrcPacketType = typename find_packet_by_size::type; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast( + srcPacketSegment(index, begin, count, 0)); + } + // unpacket_traits::size == 2 * SrcPacketSize + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int NumPackets = 2; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets = + srcPacketSegmentHelper(index, begin, count); + return pcast(packets.packet[0], packets.packet[1]); + } + // unpacket_traits::size == 4 * SrcPacketSize + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int NumPackets = 4; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets = + srcPacketSegmentHelper(index, begin, count); + return pcast(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3]); + } + // unpacket_traits::size == 8 * SrcPacketSize + template = true> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const { + constexpr int NumPackets = 8; + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + PacketBlock packets = + srcPacketSegmentHelper(index, begin, count); + return pcast(packets.packet[0], packets.packet[1], packets.packet[2], + packets.packet[3], packets.packet[4], packets.packet[5], + packets.packet[6], packets.packet[7]); + } + constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; } @@ -826,6 +1017,20 @@ struct ternary_evaluator, IndexBased m_d.arg3Impl.template packet(index)); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_d.func().packetOp(m_d.arg1Impl.template packetSegment(row, col, begin, count), + m_d.arg2Impl.template packetSegment(row, col, begin, count), + m_d.arg3Impl.template packetSegment(row, col, begin, count)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_d.func().packetOp(m_d.arg1Impl.template packetSegment(index, begin, count), + m_d.arg2Impl.template packetSegment(index, begin, count), + m_d.arg3Impl.template packetSegment(index, begin, count)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -922,6 +1127,18 @@ struct binary_evaluator, IndexBased, IndexBase m_d.rhsImpl.template packet(index)); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_d.func().packetOp(m_d.lhsImpl.template packetSegment(row, col, begin, count), + m_d.rhsImpl.template packetSegment(row, col, begin, count)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_d.func().packetOp(m_d.lhsImpl.template packetSegment(index, begin, count), + m_d.rhsImpl.template packetSegment(index, begin, count)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -1013,7 +1230,7 @@ struct mapbase_evaluator : evaluator_base { m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) { EIGEN_STATIC_ASSERT(check_implication((evaluator::Flags & PacketAccessBit) != 0, - internal::inner_stride_at_compile_time::ret == 1), + inner_stride_at_compile_time::ret == 1), PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1035,23 +1252,47 @@ struct mapbase_evaluator : evaluator_base { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { PointerType ptr = m_data + row * rowStride() + col * colStride(); - return internal::ploadt(ptr); + return ploadt(ptr); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return internal::ploadt(m_data + index * m_innerStride.value()); + return ploadt(m_data + index * m_innerStride.value()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { PointerType ptr = m_data + row * rowStride() + col * colStride(); - return internal::pstoret(ptr, x); + pstoret(ptr, x); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - internal::pstoret(m_data + index * m_innerStride.value(), x); + pstoret(m_data + index * m_innerStride.value(), x); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + return ploadtSegment(ptr, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return ploadtSegment(m_data + index * m_innerStride.value(), begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + pstoretSegment(ptr, x, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + pstoretSegment(m_data + index * m_innerStride.value(), x, begin, count); } protected: @@ -1063,8 +1304,8 @@ struct mapbase_evaluator : evaluator_base { } PointerType m_data; - const internal::variable_if_dynamic m_innerStride; - const internal::variable_if_dynamic m_outerStride; + const variable_if_dynamic m_innerStride; + const variable_if_dynamic m_outerStride; }; template @@ -1117,7 +1358,7 @@ struct evaluator> // -------------------- Block -------------------- template ::ret> + bool HasDirectAccess = has_direct_access::ret> struct block_evaluator; template @@ -1246,6 +1487,39 @@ struct unary_evaluator, IndexBa x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_argImpl.template packetSegment(m_startRow.value() + row, m_startCol.value() + col, + begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + if (ForwardLinearAccess) + return m_argImpl.template packetSegment(m_linear_offset.value() + index, begin, count); + else + return packetSegment(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0, + begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + return m_argImpl.template writePacketSegment(m_startRow.value() + row, + m_startCol.value() + col, x, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + if (ForwardLinearAccess) + return m_argImpl.template writePacketSegment(m_linear_offset.value() + index, x, begin, + count); + else + return writePacketSegment(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, x, begin, count); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const { @@ -1341,8 +1615,8 @@ struct unary_evaluator> typedef Replicate XprType; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor }; - typedef typename internal::nested_eval::type ArgTypeNested; - typedef internal::remove_all_t ArgTypeNestedCleaned; + typedef typename nested_eval::type ArgTypeNested; + typedef remove_all_t ArgTypeNestedCleaned; enum { CoeffReadCost = evaluator::CoeffReadCost, @@ -1361,19 +1635,15 @@ struct unary_evaluator> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { // try to avoid using modulo; this is a pure optimization strategy - const Index actual_row = internal::traits::RowsAtCompileTime == 1 ? 0 - : RowFactor == 1 ? row - : row % m_rows.value(); - const Index actual_col = internal::traits::ColsAtCompileTime == 1 ? 0 - : ColFactor == 1 ? col - : col % m_cols.value(); + const Index actual_row = traits::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value(); + const Index actual_col = traits::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value(); return m_argImpl.coeff(actual_row, actual_col); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { // try to avoid using modulo; this is a pure optimization strategy - const Index actual_index = internal::traits::RowsAtCompileTime == 1 + const Index actual_index = traits::RowsAtCompileTime == 1 ? (ColFactor == 1 ? index : index % m_cols.value()) : (RowFactor == 1 ? index : index % m_rows.value()); @@ -1382,25 +1652,38 @@ struct unary_evaluator> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - const Index actual_row = internal::traits::RowsAtCompileTime == 1 ? 0 - : RowFactor == 1 ? row - : row % m_rows.value(); - const Index actual_col = internal::traits::ColsAtCompileTime == 1 ? 0 - : ColFactor == 1 ? col - : col % m_cols.value(); + const Index actual_row = traits::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value(); + const Index actual_col = traits::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value(); return m_argImpl.template packet(actual_row, actual_col); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const { - const Index actual_index = internal::traits::RowsAtCompileTime == 1 + const Index actual_index = traits::RowsAtCompileTime == 1 ? (ColFactor == 1 ? index : index % m_cols.value()) : (RowFactor == 1 ? index : index % m_rows.value()); return m_argImpl.template packet(actual_index); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + const Index actual_row = traits::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value(); + const Index actual_col = traits::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value(); + + return m_argImpl.template packetSegment(actual_row, actual_col, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + const Index actual_index = traits::RowsAtCompileTime == 1 + ? (ColFactor == 1 ? index : index % m_cols.value()) + : (RowFactor == 1 ? index : index % m_rows.value()); + + return m_argImpl.template packetSegment(actual_index, begin, count); + } + protected: const ArgTypeNested m_arg; evaluator m_argImpl; @@ -1457,6 +1740,28 @@ struct evaluator_wrapper_base : evaluator_base { m_argImpl.template writePacket(index, x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return m_argImpl.template packetSegment(row, col, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + return m_argImpl.template packetSegment(index, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment(row, col, x, begin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + m_argImpl.template writePacketSegment(index, x, begin, count); + } + protected: evaluator m_argImpl; }; @@ -1536,41 +1841,97 @@ struct unary_evaluator> : evaluator_base EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - enum { - PacketSize = unpacket_traits::size, - OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1, - OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1 - }; - typedef internal::reverse_packet_cond reverse_packet; - return reverse_packet::run(m_argImpl.template packet( - ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? m_cols.value() - col - OffsetCol : col)); + static constexpr int PacketSize = unpacket_traits::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + + return reverse_packet::run(m_argImpl.template packet(actualRow, actualCol)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const { - enum { PacketSize = unpacket_traits::size }; - return preverse( - m_argImpl.template packet(m_rows.value() * m_cols.value() - index - PacketSize)); + static constexpr int PacketSize = unpacket_traits::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + + return preverse(m_argImpl.template packet(actualIndex)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { - // FIXME we could factorize some code with packet(i,j) - enum { - PacketSize = unpacket_traits::size, - OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1, - OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1 - }; - typedef internal::reverse_packet_cond reverse_packet; - m_argImpl.template writePacket(ReverseRow ? m_rows.value() - row - OffsetRow : row, - ReverseCol ? m_cols.value() - col - OffsetCol : col, - reverse_packet::run(x)); + static constexpr int PacketSize = unpacket_traits::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + + m_argImpl.template writePacket(actualRow, actualCol, reverse_packet::run(x)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - enum { PacketSize = unpacket_traits::size }; - m_argImpl.template writePacket(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); + static constexpr int PacketSize = unpacket_traits::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + + m_argImpl.template writePacket(actualIndex, preverse(x)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + static constexpr int PacketSize = unpacket_traits::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin; + + return reverse_packet::run( + m_argImpl.template packetSegment(actualRow, actualCol, actualBegin, count)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const { + static constexpr int PacketSize = unpacket_traits::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + Index actualBegin = PacketSize - count - begin; + + return preverse(m_argImpl.template packetSegment(actualIndex, actualBegin, count)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin, + Index count) { + static constexpr int PacketSize = unpacket_traits::size; + static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + using reverse_packet = reverse_packet_cond; + + Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin; + + m_argImpl.template writePacketSegment(actualRow, actualCol, reverse_packet::run(x), actualBegin, count); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin, + Index count) { + static constexpr int PacketSize = unpacket_traits::size; + + Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + Index actualBegin = PacketSize - count - begin; + + m_argImpl.template writePacketSegment(actualIndex, preverse(x), actualBegin, count); } protected: @@ -1621,7 +1982,7 @@ struct evaluator> : evaluator_base m_argImpl; - const internal::variable_if_dynamicindex m_index; + const variable_if_dynamicindex m_index; private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const { diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 71862fa1c..642f08cc7 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -1562,6 +1562,72 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) { return (Packet)pand(result, peven_mask(result)); // atan2 0 atan2 0 ... } +/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements + * outside this range are not defined. \a *from does not need to be aligned, and can be null if \a count is zero.*/ +template +EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits::type* from, Index begin, + Index count) { + using Scalar = typename unpacket_traits::type; + constexpr Index PacketSize = unpacket_traits::size; + eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range"); + Scalar aux[PacketSize]; + smart_copy(from + begin, from + begin + count, aux + begin); + return ploadu(aux); +} + +/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements + * outside this range are not defined. \a *from must be aligned, and cannot be null.*/ +template +EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits::type* from, Index begin, + Index count) { + return ploaduSegment(from, begin, count); +} + +/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to. +Elements outside of the range [begin, begin + count) are not defined. \a *to does not need to be aligned, and can be +null if \a count is zero.*/ +template +EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) { + constexpr Index PacketSize = unpacket_traits::size; + eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range"); + Scalar aux[PacketSize]; + pstoreu(aux, from); + smart_copy(aux + begin, aux + begin + count, to + begin); +} + +/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to. +Elements outside of the range [begin, begin + count) are not defined. \a *to must be aligned, and cannot be +null.*/ +template +EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) { + return pstoreuSegment(to, from, begin, count); +} + +/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements + * outside this range are not defined.*/ +template +EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits::type* from, Index begin, + Index count) { + constexpr int RequiredAlignment = unpacket_traits::alignment; + if (Alignment >= RequiredAlignment) { + return ploadSegment(from, begin, count); + } else { + return ploaduSegment(from, begin, count); + } +} + +/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to. +Elements outside of the range [begin, begin + count) are not defined.*/ +template +EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) { + constexpr int RequiredAlignment = unpacket_traits::alignment; + if (Alignment >= RequiredAlignment) { + pstoreSegment(to, from, begin, count); + } else { + pstoreuSegment(to, from, begin, count); + } +} + #ifndef EIGEN_NO_IO template diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 9de64810a..ce8d954bf 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -283,7 +283,7 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, cons template struct generic_product_impl { template - struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {}; + struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {}; typedef typename Product::Scalar Scalar; // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose @@ -445,7 +445,7 @@ struct generic_product_impl::extract(lhs).template conjugateIf(), blas_traits::extract(rhs).template conjugateIf(), func, actualAlpha, - std::conditional_t()); + bool_constant()); } protected: @@ -635,6 +635,24 @@ struct product_evaluator, ProductTag, DenseShape, return packet(row, col); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin, + Index count) const { + PacketType res; + typedef etor_product_packet_impl + PacketImpl; + PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count); + return res; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const { + const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index; + const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0; + return packetSegment(row, col, begin, count); + } + protected: add_const_on_value_type_t m_lhs; add_const_on_value_type_t m_rhs; @@ -670,6 +688,13 @@ struct etor_product_packet_impl(lhs.coeff(row, Index(UnrollingIndex - 1))), rhs.template packet(Index(UnrollingIndex - 1), col), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + etor_product_packet_impl::run_segment( + row, col, lhs, rhs, innerDim, res, begin, count); + res = pmadd(pset1(lhs.coeff(row, Index(UnrollingIndex - 1))), + rhs.template packetSegment(Index(UnrollingIndex - 1), col, begin, count), res); + } }; template @@ -681,6 +706,13 @@ struct etor_product_packet_impl(row, Index(UnrollingIndex - 1)), pset1(rhs.coeff(Index(UnrollingIndex - 1), col)), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + etor_product_packet_impl::run_segment( + row, col, lhs, rhs, innerDim, res, begin, count); + res = pmadd(lhs.template packetSegment(row, Index(UnrollingIndex - 1), begin, count), + pset1(rhs.coeff(Index(UnrollingIndex - 1), col)), res); + } }; template @@ -689,6 +721,12 @@ struct etor_product_packet_impl { Index /*innerDim*/, Packet& res) { res = pmul(pset1(lhs.coeff(row, Index(0))), rhs.template packet(Index(0), col)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index /*innerDim*/, Packet& res, Index begin, + Index count) { + res = pmul(pset1(lhs.coeff(row, Index(0))), + rhs.template packetSegment(Index(0), col, begin, count)); + } }; template @@ -697,6 +735,12 @@ struct etor_product_packet_impl { Index /*innerDim*/, Packet& res) { res = pmul(lhs.template packet(row, Index(0)), pset1(rhs.coeff(Index(0), col))); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index /*innerDim*/, Packet& res, Index begin, + Index count) { + res = pmul(lhs.template packetSegment(row, Index(0), begin, count), + pset1(rhs.coeff(Index(0), col))); + } }; template @@ -705,6 +749,11 @@ struct etor_product_packet_impl { const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) { res = pset1(typename unpacket_traits::type(0)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, + const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res, + Index /*begin*/, Index /*count*/) { + res = pset1(typename unpacket_traits::type(0)); + } }; template @@ -713,6 +762,11 @@ struct etor_product_packet_impl { const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) { res = pset1(typename unpacket_traits::type(0)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, + const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res, + Index /*begin*/, Index /*count*/) { + res = pset1(typename unpacket_traits::type(0)); + } }; template @@ -723,6 +777,13 @@ struct etor_product_packet_impl { for (Index i = 0; i < innerDim; ++i) res = pmadd(pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + res = pset1(typename unpacket_traits::type(0)); + for (Index i = 0; i < innerDim; ++i) + res = pmadd(pset1(lhs.coeff(row, i)), rhs.template packetSegment(i, col, begin, count), + res); + } }; template @@ -733,6 +794,13 @@ struct etor_product_packet_impl { for (Index i = 0; i < innerDim; ++i) res = pmadd(lhs.template packet(row, i), pset1(rhs.coeff(i, col)), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index begin, Index count) { + res = pset1(typename unpacket_traits::type(0)); + for (Index i = 0; i < innerDim; ++i) + res = pmadd(lhs.template packetSegment(row, i, begin, count), pset1(rhs.coeff(i, col)), + res); + } }; /*************************************************************************** @@ -871,6 +939,26 @@ struct diagonal_product_evaluator_base : evaluator_base { m_diagImpl.template packet(id)); } + template + EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count, + internal::true_type) const { + return internal::pmul(m_matImpl.template packetSegment(row, col, begin, count), + internal::pset1(m_diagImpl.coeff(id))); + } + + template + EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count, + internal::false_type) const { + enum { + InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, + DiagonalPacketLoadMode = plain_enum_min( + LoadMode, + ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator::Alignment)) // FIXME hardcoded 16!! + }; + return internal::pmul(m_matImpl.template packetSegment(row, col, begin, count), + m_diagImpl.template packetSegment(id, begin, count)); + } + evaluator m_diagImpl; evaluator m_matImpl; }; @@ -892,7 +980,8 @@ struct product_evaluator, ProductTag, DiagonalSha typedef typename XprType::PlainObject PlainObject; typedef typename Lhs::DiagonalVectorType DiagonalType; - enum { StorageOrder = Base::StorageOrder_ }; + static constexpr int StorageOrder = Base::StorageOrder_; + using IsRowMajor_t = bool_constant; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {} @@ -905,8 +994,7 @@ struct product_evaluator, ProductTag, DiagonalSha EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case. // See also similar calls below. - return this->template packet_impl( - row, col, row, std::conditional_t()); + return this->template packet_impl(row, col, row, IsRowMajor_t()); } template @@ -914,6 +1002,19 @@ struct product_evaluator, ProductTag, DiagonalSha return packet(int(StorageOrder) == ColMajor ? idx : 0, int(StorageOrder) == ColMajor ? 0 : idx); } + + template + EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case. + // See also similar calls below. + return this->template packet_segment_impl(row, col, row, begin, count, IsRowMajor_t()); + } + + template + EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const { + return packetSegment(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx, + begin, count); + } #endif }; @@ -933,7 +1034,8 @@ struct product_evaluator, ProductTag, DenseShape, typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - enum { StorageOrder = Base::StorageOrder_ }; + static constexpr int StorageOrder = Base::StorageOrder_; + using IsColMajor_t = bool_constant; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {} @@ -944,14 +1046,23 @@ struct product_evaluator, ProductTag, DenseShape, #ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return this->template packet_impl( - row, col, col, std::conditional_t()); + return this->template packet_impl(row, col, col, IsColMajor_t()); } template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { - return packet(int(StorageOrder) == ColMajor ? idx : 0, - int(StorageOrder) == ColMajor ? 0 : idx); + return packet(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx); + } + + template + EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const { + return this->template packet_segment_impl(row, col, col, begin, count, IsColMajor_t()); + } + + template + EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const { + return packetSegment(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx, + begin, count); } #endif }; diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index d417c1ad1..dd825e907 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -65,6 +65,31 @@ class generic_dense_assignment_kernel(row, col); } + + template + EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) { + PacketType tmp = m_src.template packetSegment(row, col, begin, count); + const_cast(m_src).template writePacketSegment( + row, col, m_dst.template packetSegment(row, col, begin, count), begin, count); + m_dst.template writePacketSegment(row, col, tmp, begin, count); + } + + template + EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) { + PacketType tmp = m_src.template packetSegment(index, begin, count); + const_cast(m_src).template writePacketSegment( + index, m_dst.template packetSegment(index, begin, count), begin, count); + m_dst.template writePacketSegment(index, tmp, begin, count); + } + + // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I + // mean no CRTP (Gael) + template + EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) { + Index row = Base::rowIndexByOuterInner(outer, inner); + Index col = Base::colIndexByOuterInner(outer, inner); + assignPacketSegment(row, col, begin, count); + } }; } // namespace internal diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 9887db67a..1342478cd 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -36,6 +36,10 @@ template class PartialReduxExpr; namespace internal { + +template +struct enable_packet_segment> : std::false_type {}; + template struct traits > : traits { typedef typename MemberOp::result_type Scalar; diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 09fa20b7c..a4a87c4fc 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -507,6 +507,57 @@ EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, con return pnegate(pmadd(a, b, c)); } #endif + +/*---------------- load/store segment support ----------------*/ + +/*---------------- std::complex ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet2cf ploaduSegment(const std::complex* from, Index begin, Index count) { + return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count)); +} + +template <> +inline void pstoreuSegment, Packet2cf>(std::complex* to, const Packet2cf& from, Index begin, + Index count) { + _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v); +} + +template <> +inline Packet4cf ploaduSegment(const std::complex* from, Index begin, Index count) { + return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count)); +} + +template <> +inline void pstoreuSegment, Packet4cf>(std::complex* to, const Packet4cf& from, Index begin, + Index count) { + _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v); +} + +/*---------------- std::complex ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet2cd ploaduSegment(const std::complex* from, Index begin, Index count) { + return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count)); +} + +template <> +inline void pstoreuSegment, Packet2cd>(std::complex* to, const Packet2cd& from, + Index begin, Index count) { + _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v); +} + +/*---------------- end load/store segment support ----------------*/ + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 31a1d8ac8..fb49206ea 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -2938,6 +2938,258 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47); } +/*---------------- load/store segment support ----------------*/ + +// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_4x8(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 4); + long long mask = 1; + mask <<= CHAR_BIT * count; + mask--; + mask <<= CHAR_BIT * begin; +#if defined(_WIN32) && !defined(_WIN64) + return _mm_loadl_epi64(reinterpret_cast(&mask)); +#else + return _mm_cvtsi64_si128(mask); +#endif +} + +// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_8x8(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 8); + long long mask = 1; + // avoid UB when count == 8 + mask <<= (CHAR_BIT / 2) * count; + mask <<= (CHAR_BIT / 2) * count; + mask--; + mask <<= CHAR_BIT * begin; +#if defined(_WIN32) && !defined(_WIN64) + return _mm_loadl_epi64(reinterpret_cast(&mask)); +#else + return _mm_cvtsi64_si128(mask); +#endif +} + +// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_4x32(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 4); + return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count)); +} + +// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m128i segment_mask_2x64(Index begin, Index count) { + eigen_assert(begin >= 0 && begin + count <= 2); + return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count)); +} + +// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m256i segment_mask_8x32(Index begin, Index count) { + __m128i mask_epi8 = segment_mask_8x8(begin, count); +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8); +#else + __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8); + __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32)); + __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1); +#endif + return mask_epi32; +} + +// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere. +inline __m256i segment_mask_4x64(Index begin, Index count) { + __m128i mask_epi8 = segment_mask_4x8(begin, count); +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8); +#else + __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8); + __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16)); + __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1); +#endif + return mask_epi64; +} + +/*---------------- float ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet4f ploaduSegment(const float* from, Index begin, Index count) { + return _mm_maskload_ps(from, segment_mask_4x32(begin, count)); +} + +template <> +inline void pstoreuSegment(float* to, const Packet4f& from, Index begin, Index count) { + _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from); +} + +template <> +inline Packet8f ploaduSegment(const float* from, Index begin, Index count) { + return _mm256_maskload_ps(from, segment_mask_8x32(begin, count)); +} + +template <> +inline void pstoreuSegment(float* to, const Packet8f& from, Index begin, Index count) { + _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from); +} + +/*---------------- int32 ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +struct has_packet_segment : std::true_type {}; + +#ifdef EIGEN_VECTORIZE_AVX2 + +template <> +inline Packet4i ploaduSegment(const int* from, Index begin, Index count) { + return _mm_maskload_epi32(from, segment_mask_4x32(begin, count)); +} + +template <> +inline void pstoreuSegment(int* to, const Packet4i& from, Index begin, Index count) { + _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from); +} + +template <> +inline Packet8i ploaduSegment(const int* from, Index begin, Index count) { + return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count)); +} + +template <> +inline void pstoreuSegment(int* to, const Packet8i& from, Index begin, Index count) { + _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from); +} + +#else + +template <> +inline Packet4i ploaduSegment(const int* from, Index begin, Index count) { + return _mm_castps_si128(ploaduSegment(reinterpret_cast(from), begin, count)); +} + +template <> +inline void pstoreuSegment(int* to, const Packet4i& from, Index begin, Index count) { + pstoreuSegment(reinterpret_cast(to), _mm_castsi128_ps(from), begin, count); +} + +template <> +inline Packet8i ploaduSegment(const int* from, Index begin, Index count) { + return _mm256_castps_si256(ploaduSegment(reinterpret_cast(from), begin, count)); +} + +template <> +inline void pstoreuSegment(int* to, const Packet8i& from, Index begin, Index count) { + pstoreuSegment(reinterpret_cast(to), _mm256_castsi256_ps(from), begin, count); +} + +#endif + +/*---------------- uint32 ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet4ui ploaduSegment(const uint32_t* from, Index begin, Index count) { + return Packet4ui(ploaduSegment(reinterpret_cast(from), begin, count)); +} + +template <> +inline void pstoreuSegment(uint32_t* to, const Packet4ui& from, Index begin, Index count) { + pstoreuSegment(reinterpret_cast(to), Packet4i(from), begin, count); +} + +template <> +inline Packet8ui ploaduSegment(const uint32_t* from, Index begin, Index count) { + return Packet8ui(ploaduSegment(reinterpret_cast(from), begin, count)); +} + +template <> +inline void pstoreuSegment(uint32_t* to, const Packet8ui& from, Index begin, Index count) { + pstoreuSegment(reinterpret_cast(to), Packet8i(from), begin, count); +} + +/*---------------- double ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet2d ploaduSegment(const double* from, Index begin, Index count) { + return _mm_maskload_pd(from, segment_mask_2x64(begin, count)); +} + +template <> +inline void pstoreuSegment(double* to, const Packet2d& from, Index begin, Index count) { + _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from); +} + +template <> +inline Packet4d ploaduSegment(const double* from, Index begin, Index count) { + return _mm256_maskload_pd(from, segment_mask_4x64(begin, count)); +} + +template <> +inline void pstoreuSegment(double* to, const Packet4d& from, Index begin, Index count) { + _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from); +} + +#ifdef EIGEN_VECTORIZE_AVX2 + +/*---------------- int64_t ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet2l ploaduSegment(const int64_t* from, Index begin, Index count) { + return _mm_maskload_epi64(reinterpret_cast(from), segment_mask_2x64(begin, count)); +} +template <> +inline void pstoreuSegment(int64_t* to, const Packet2l& from, Index begin, Index count) { + _mm_maskstore_epi64(reinterpret_cast(to), segment_mask_2x64(begin, count), from); +} +template <> +inline Packet4l ploaduSegment(const int64_t* from, Index begin, Index count) { + return _mm256_maskload_epi64(reinterpret_cast(from), segment_mask_4x64(begin, count)); +} +template <> +inline void pstoreuSegment(int64_t* to, const Packet4l& from, Index begin, Index count) { + _mm256_maskstore_epi64(reinterpret_cast(to), segment_mask_4x64(begin, count), from); +} + +/*---------------- uint64_t ----------------*/ + +template <> +struct has_packet_segment : std::true_type {}; + +template <> +inline Packet4ul ploaduSegment(const uint64_t* from, Index begin, Index count) { + return Packet4ul(ploaduSegment(reinterpret_cast(from), begin, count)); +} +template <> +inline void pstoreuSegment(uint64_t* to, const Packet4ul& from, Index begin, Index count) { + pstoreuSegment(reinterpret_cast(to), Packet4l(from), begin, count); +} +#endif + +/*---------------- end load/store segment support ----------------*/ + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 7edcc603c..0239262ae 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -29,6 +29,11 @@ struct assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { pstoret(a, b); } + + template + EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const { + pstoretSegment(a, b, begin, count); + } }; // Empty overload for void type (used by PermutationMatrix) @@ -60,6 +65,12 @@ struct compound_assign_op { assign_op().template assignPacket( a, Func().packetOp(ploadt(a), b)); } + + template + EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const { + assign_op().template assignPacketSegment( + a, Func().packetOp(ploadtSegment(a, begin, count), b), begin, count); + } }; template diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index c91e6bb52..a93b998b9 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -438,7 +438,6 @@ struct scalar_quotient_op : binary_op_base { } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { - maybe_raise_div_by_zero::run(b); return internal::pdiv(a, b); } }; diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index 14b56d733..35dc73869 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -28,7 +28,7 @@ struct scalar_constant_op { const Scalar m_other; }; template -struct functor_traits > { +struct functor_traits> { enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */, PacketAccess = packet_traits::Vectorizable, @@ -56,7 +56,7 @@ struct scalar_identity_op { } }; template -struct functor_traits > { +struct functor_traits> { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; @@ -86,18 +86,19 @@ struct linspaced_op_impl { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) + Packet low = pset1(m_low); + Packet high = pset1(m_high); + Packet step = pset1(m_step); if (m_flip) { Packet pi = plset(Scalar(i - m_size1)); - Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != 0)) return res; - Packet mask = pcmp_lt(pset1(0), plset(0)); - return pselect(mask, res, pset1(m_low)); + Packet res = pmadd(step, pi, high); + Packet mask = pcmp_lt(pzero(res), plset(Scalar(i))); + return pselect(mask, res, low); } else { Packet pi = plset(Scalar(i)); - Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits::size + 1)) return res; - Packet mask = pcmp_lt(plset(0), pset1(unpacket_traits::size - 1)); - return pselect(mask, res, pset1(m_high)); + Packet res = pmadd(step, pi, low); + Packet mask = pcmp_lt(pi, pset1(Scalar(m_size1))); + return pselect(mask, res, high); } } @@ -139,7 +140,7 @@ struct linspaced_op_impl { template struct linspaced_op; template -struct functor_traits > { +struct functor_traits> { enum { Cost = 1, PacketAccess = (!NumTraits::IsInteger) && packet_traits::HasSetLinear, @@ -192,7 +193,7 @@ struct equalspaced_op { }; template -struct functor_traits > { +struct functor_traits> { enum { Cost = NumTraits::AddCost + NumTraits::MulCost, PacketAccess = diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 2488be46f..8d1073c86 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -514,6 +514,12 @@ template struct eigen_memset_helper; template ::value> struct eigen_zero_impl; + +template +struct has_packet_segment : std::false_type {}; + +template +struct enable_packet_segment : std::true_type {}; } // namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 40604f8ea..531746622 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -90,12 +90,8 @@ namespace internal { * we however don't want to add a dependency to Boost. */ -struct true_type { - enum { value = 1 }; -}; -struct false_type { - enum { value = 0 }; -}; +using std::false_type; +using std::true_type; template struct bool_constant; diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a42bb0f73..75a78a1a2 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -996,6 +996,27 @@ struct is_matrix_base_xpr : std::is_base_of>, r template struct is_permutation_base_xpr : std::is_base_of>, remove_all_t> {}; +/*---------------- load/store segment support ----------------*/ + +// recursively traverse unary, binary, and ternary expressions to determine if packet segments are supported + +template +struct enable_packet_segment> : enable_packet_segment> {}; + +template +struct enable_packet_segment> : enable_packet_segment> {}; + +template +struct enable_packet_segment> + : bool_constant>::value && + enable_packet_segment>::value> {}; + +template +struct enable_packet_segment> + : bool_constant>::value && + enable_packet_segment>::value && + enable_packet_segment>::value> {}; + } // end namespace internal /** \class ScalarBinaryOpTraits diff --git a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h index a45be20b2..c603a38a2 100644 --- a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +++ b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h @@ -308,19 +308,24 @@ struct dense_assignment_loop_with_devicetemplate assignPacket(index); } }; + static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment; + using head_loop = + unaligned_dense_assignment_loop; + using tail_loop = unaligned_dense_assignment_loop; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { const Index size = kernel.size(); const Index alignedStart = DstIsAligned ? 0 : internal::first_aligned(kernel.dstDataPtr(), size); const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize); - unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); + head_loop::run(kernel, 0, alignedStart); constexpr float cost = static_cast(XprEvaluationCost); AssignmentFunctor functor(kernel); device.template parallelFor(alignedStart, alignedEnd, functor, cost); - unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); + tail_loop::run(kernel, alignedEnd, size); } }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 813cc531c..e62ec4529 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -186,6 +186,7 @@ ei_add_test(mixingtypes) ei_add_test(float_conversion) ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") +ei_add_test(packet_segment) ei_add_test(vectorization_logic) ei_add_test(basicstuff) ei_add_test(constexpr) diff --git a/test/packet_segment.cpp b/test/packet_segment.cpp new file mode 100644 index 000000000..6fa6a290d --- /dev/null +++ b/test/packet_segment.cpp @@ -0,0 +1,168 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 The Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template +void verify_data(const Scalar* data_in, const Scalar* data_out, const Packet& a, Index begin, Index count) { + constexpr int PacketSize = internal::unpacket_traits::size; + bool ok = true; + for (Index i = begin; i < begin + count; i++) { + ok = ok && numext::equal_strict(data_in[i], data_out[i]); + } + if (!ok) { + std::cout << "begin: " << begin << ", count: " << count << "\n"; + std::cout << "Scalar type: " << type_name(Scalar()) << " x " << PacketSize << "\n"; + std::cout << "data in: {"; + for (Index i = 0; i < PacketSize; i++) { + if (i > 0) std::cout << ","; + if (i < begin || i >= begin + count) { + std::cout << "MASK"; + } else { + std::cout << data_in[i]; + } + } + std::cout << "}\n"; + std::cout << "data out: {"; + for (Index i = 0; i < PacketSize; i++) { + if (i > 0) std::cout << ","; + if (i < begin || i >= begin + count) { + std::cout << "MASK"; + } else { + std::cout << data_out[i]; + } + } + std::cout << "}\n"; + std::cout << "packet: "; + std::cout << internal::postream(a) << "\n"; + } + VERIFY(ok); +} + +template ::value> +struct packet_segment_test_impl { + using Packet = typename internal::find_packet_by_size::type; + static void test_unaligned() { + // test loading a packet segment from unaligned memory that includes unallocated memory + + // | X X X X | * * * X | X X X X | + // begin -> { X | * * * } <- begin + count + + VectorX data_in(PacketSize), data_out(PacketSize); + data_in.setRandom(); + data_out.setRandom(); + + Scalar* unaligned_data_in = data_in.data() - 1; + Scalar* unaligned_data_out = data_out.data() - 1; + + Index begin = 1; + Index count = PacketSize - 1; + + Packet a = internal::ploaduSegment(unaligned_data_in, begin, count); + internal::pstoreuSegment(unaligned_data_out, a, begin, count); + + verify_data(unaligned_data_in, unaligned_data_out, a, begin, count); + + // test loading the entire packet + + data_in.setRandom(); + data_out.setRandom(); + + unaligned_data_in = data_in.data(); + unaligned_data_out = data_out.data(); + + begin = 0; + count = PacketSize; + + Packet b = internal::ploaduSegment(unaligned_data_in, begin, count); + internal::pstoreuSegment(unaligned_data_out, b, begin, count); + + verify_data(unaligned_data_in, unaligned_data_out, b, begin, count); + + // test loading an empty packet segment in unallocated memory + count = 0; + + for (begin = 0; begin < PacketSize; begin++) { + data_in.setRandom(); + data_out = data_in; + Packet c = internal::ploaduSegment(data_in.data(), begin, count); + internal::pstoreuSegment(data_out.data(), c, begin, count); + // verify that ploaduSegment / pstoreuSegment did nothing + VERIFY_IS_CWISE_EQUAL(data_in, data_out); + } + } + static void test_aligned() { + // test loading a packet segment from aligned memory that includes unallocated memory + + // | X X X X | * * * X | X X X X | + // begin -> { * * * X } <- begin + count + + VectorX data_in(PacketSize - 1), data_out(PacketSize - 1); + data_in.setRandom(); + data_out.setRandom(); + + Scalar* aligned_data_in = data_in.data(); + Scalar* aligned_data_out = data_out.data(); + + Index begin = 0; + Index count = PacketSize - 1; + + Packet b = internal::ploadSegment(aligned_data_in, begin, count); + internal::pstoreSegment(aligned_data_out, b, begin, count); + + verify_data(aligned_data_in, aligned_data_out, b, begin, count); + } + static void run() { + test_unaligned(); + test_aligned(); + } +}; + +template +struct packet_segment_test_impl { + static void run() {} +}; + +template +struct packet_segment_test_driver { + static void run() { + packet_segment_test_impl::run(); + packet_segment_test_driver::run(); + } +}; + +template +struct packet_segment_test_driver { + static void run() {} +}; + +template +void test_packet_segment() { + packet_segment_test_driver::size>::run(); +} + +EIGEN_DECLARE_TEST(packet_segment) { + for (int i = 0; i < g_repeat; i++) { + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment(); + test_packet_segment>(); + test_packet_segment>(); + } +}