diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 39efb1d5a..f589555cc 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -365,13 +365,14 @@ struct dense_assignment_loop typedef typename Kernel::Scalar Scalar; typedef packet_traits PacketTraits; enum { + requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment, packetSize = PacketTraits::size, - dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment), - dstAlignment = PacketTraits::AlignedOnScalar ? int(Kernel::AssignmentTraits::RequiredAlignment) + dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), + dstAlignment = PacketTraits::AlignedOnScalar ? int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment), srcAlignment = Kernel::AssignmentTraits::JointAlignment }; - const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size); + const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size); const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); @@ -479,9 +480,10 @@ struct dense_assignment_loop typedef packet_traits PacketTraits; enum { packetSize = PacketTraits::size, + requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment), alignable = PacketTraits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar), - dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment), - dstAlignment = alignable ? int(Kernel::AssignmentTraits::RequiredAlignment) + dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), + dstAlignment = alignable ? 
int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment) }; const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0); @@ -494,7 +496,7 @@ struct dense_assignment_loop const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0; - Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize); + Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize); for(Index outer = 0; outer < outerSize; ++outer) { diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 11e2a1809..d053911e3 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -580,33 +580,41 @@ class DenseCoeffsBase namespace internal { -template +template struct first_aligned_impl { static inline Index run(const Derived&) { return 0; } }; -template -struct first_aligned_impl +template +struct first_aligned_impl { static inline Index run(const Derived& m) { - return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size()); + return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size()); } }; -/** \internal \returns the index of the first element of the array that is well aligned for vectorization. +/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization. + * + * \tparam Alignment requested alignment in Bytes. * * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more * documentation. */ -template +template static inline Index first_aligned(const DenseBase& m) { - return first_aligned_impl - ::Alignment > 0 ) || !(Derived::Flags & DirectAccessBit)> // FIXME Alignment! 
- ::run(m.derived()); + enum { ReturnZero = (int(evaluator::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) }; + return first_aligned_impl::run(m.derived()); +} + +template +static inline Index first_default_aligned(const DenseBase& m) +{ + typedef typename Derived::Scalar Scalar; + return first_aligned::size*sizeof(Scalar)>(m); } template::ret> diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 0c25223aa..fa308b53e 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -221,12 +221,13 @@ struct redux_impl { const Index size = mat.size(); - const Index packetSize = packet_traits::size; - const Index alignedStart = internal::first_aligned(mat.nestedExpression()); + const Index packetSize = packet_traits::size; + const int packetBytes = int(packetSize*sizeof(Scalar)); enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(sizeof(Scalar)*packetSize) : int(Unaligned), // FIXME take into account alignment requirement + alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? 
int(packetBytes) : int(Unaligned), // FIXME take into account alignment requirement alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) }; + const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index aca81f463..7fe39808b 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -178,7 +178,7 @@ MatrixBase::stableNorm() const if(n==1) return abs(this->coeff(0)); - Index bi = internal::first_aligned(copy); + Index bi = internal::first_default_aligned(copy); if (bi>0) internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); for (; bi1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 5d6ef9913..f3443bd10 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -94,7 +94,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product& blocking) { + const Index PacketBytes = packet_traits::size*sizeof(Scalar); // strip zeros Index diagSize = (std::min)(_cols,_depth); Index rows = _rows; @@ -311,7 +312,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; - geb = geb + internal::first_aligned(geb,EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar)); + geb = geb + internal::first_aligned(geb,PacketBytes/sizeof(Scalar)); pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 
0 : k2), actual_kc, rs); diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 934948ebd..d00fa9707 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -230,7 +230,7 @@ class blas_data_mapper { if (size_t(m_data)%sizeof(Scalar)) { return -1; } - return internal::first_aligned(m_data, size); + return internal::first_default_aligned(m_data, size); } protected: diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index c9517acfc..957c36bcf 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -506,47 +506,56 @@ template EIGEN_DEVICE_FUNC inline void conditional_align /****************************************************************************/ -/** \internal Returns the index of the first element of the array that is well aligned for vectorization. +/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment. * + * \tparam Alignment requested alignment in Bytes. * \param array the address of the start of the array * \param size the size of the array * - * \note If no element of the array is well aligned, the size of the array is returned. Typically, - * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the + * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar, + * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If * packet size for the given scalar type is 1, then everything is considered well-aligned. * - * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a - * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). 
On the - * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for + * \note Otherwise, if the Alignment is larger than the scalar size, we rely on the assumption that sizeof(Scalar) is a + * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for * example with Scalar=double on certain 32-bit platforms, see bug #79. * * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h. + * \sa first_default_aligned() */ -template +template inline Index first_aligned(const Scalar* array, Index size) { - static const Index PacketSize = packet_traits::size; - static const Index PacketAlignedMask = PacketSize-1; + static const Index ScalarSize = sizeof(Scalar); + static const Index AlignmentSize = Alignment / ScalarSize; + static const Index AlignmentMask = AlignmentSize-1; - if(PacketSize==1) + if(AlignmentSize<=1) { - // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements - // of the array have the same alignment. + // Either the requested alignment is smaller than a scalar, or it exactly matches 1 scalar + // so that all elements of the array have the same alignment. return 0; } - else if(size_t(array) & (sizeof(Scalar)-1)) + else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0) { - // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar. + // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size. // Consequently, no element of the array is well aligned. 
return size; } else { - return std::min( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask)) - & PacketAlignedMask, size); + return std::min( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size); } } +/** \internal Returns the index of the first element of the array that is well aligned with respect to the largest packet requirement. + * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase) */ +template +inline Index first_default_aligned(const Scalar* array, Index size) +{ + return first_aligned::size*sizeof(Scalar)>(array, size); +} + /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size */ template diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index b7b83dcd2..55de15e87 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -325,7 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x // both vectors are sequentially stored in memory => vectorization enum { Peeling = 2 }; - Index alignedStart = internal::first_aligned(y, size); + Index alignedStart = internal::first_default_aligned(y, size); Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; const Packet pc = pset1(c); @@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Scalar* EIGEN_RESTRICT px = x + alignedStart; Scalar* EIGEN_RESTRICT py = y + alignedStart; - if(internal::first_aligned(x, size)==alignedStart) + if(internal::first_default_aligned(x, size)==alignedStart) { for(Index i=alignedStart; i::run(const Index seg const Index PacketSize = internal::packet_traits::size; Index ldl = internal::first_multiple(nrow, PacketSize); Map, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) ); - Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize); - Index aligned_with_B_offset = 
(PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize; + Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize); + Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize; Map, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); l.setZero(); diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index bd3cf87b9..ca78406e0 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -145,7 +145,7 @@ void SparseLUImpl::panel_bmod(const Index m, const Index w, eigen_assert(tempv.size()>w*ldu + nrow*w + 1); Index ldl = internal::first_multiple(nrow, PacketSize); - Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize; + Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize; Map, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); L.setZero(); diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp index 467f94510..bf22f6b97 100644 --- a/test/first_aligned.cpp +++ b/test/first_aligned.cpp @@ -13,7 +13,7 @@ template void test_first_aligned_helper(Scalar *array, int size) { const int packet_size = sizeof(Scalar) * internal::packet_traits::size; - VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_aligned(array, size)) % packet_size) == 0); + VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_default_aligned(array, size)) % packet_size) == 0); } template @@ -21,7 +21,7 @@ void test_none_aligned_helper(Scalar *array, int size) { EIGEN_UNUSED_VARIABLE(array); EIGEN_UNUSED_VARIABLE(size); - VERIFY(internal::packet_traits::size == 1 || internal::first_aligned(array, size) == size); + VERIFY(internal::packet_traits::size == 1 || internal::first_default_aligned(array, size) == size); } struct some_non_vectorizable_type 
{ float x; };