diff --git a/Eigen/src/Array/Functors.h b/Eigen/src/Array/Functors.h index 3357f145d..5b53a9cee 100644 --- a/Eigen/src/Array/Functors.h +++ b/Eigen/src/Array/Functors.h @@ -25,19 +25,21 @@ #ifndef EIGEN_ARRAY_FUNCTORS_H #define EIGEN_ARRAY_FUNCTORS_H +/** \internal + * \array_module + * + * \brief Template functor to add a scalar to a fixed other one + * + * \sa class CwiseUnaryOp, Array::operator+ + */ +/* If you wonder why doing the ei_pset1() in packetOp() is an optimization check ei_scalar_multiple_op */ template -struct ei_scalar_add_op { +struct ei_scalar_add_op { typedef typename ei_packet_traits::type PacketScalar; - inline ei_scalar_add_op(const Scalar& other) : m_other(ei_pset1(other)) { } - inline Scalar operator() (const Scalar& a) const { return a + ei_pfirst(m_other); } - inline const PacketScalar packetOp(const PacketScalar& a) const - { return ei_padd(a, m_other); } - const PacketScalar m_other; -}; -template -struct ei_scalar_add_op { inline ei_scalar_add_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a + m_other; } + inline const PacketScalar packetOp(const PacketScalar& a) const + { return ei_padd(a, ei_pset1(m_other)); } const Scalar m_other; }; template diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index 8c012b58f..3ce72f5ba 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -181,6 +181,8 @@ static void ei_cache_friendly_product( if (PacketSize>1 && size_t(rhsColumn)%16) { int count = 0; + // FIXME this loop get vectorized by the compiler (ICC) + // I'm not sure thats good or not for (int k = l2k; k1 && size_t(rhsColumn)%16) { int count = 0; + // FIXME this loop get vectorized by the compiler ! for (int k = l2k; k1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask; + const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; int alignmentPattern = alignmentStep==0 ? AllAligned : alignmentStep==2 ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size1) { - // nothing can be aligned, no need to skip any column - alignmentPattern = NoneAligned; - skipColumns = 0; - } - else - { - skipColumns = std::min(skipColumns,rhs.size()); - // note that the skiped columns are processed later. - } - - ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1 - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || sizealignedStart) + if (PacketSize>1) { - switch(alignmentPattern) + /* explicit vectorization */ + + // process initial unaligned coeffs + for (int j=0; jalignedStart) { - case AllAligned: - for (int j = alignedStart; j1) - { - // NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient - // than the following code - asm("#mybegin"); - Packet A00, A01, A02, A03, A10, A11, A12, A13; - for (int j = alignedStart; j1) { - A01 = ei_ploadu(&lhs1[j]); A11 = ei_ploadu(&lhs1[j+PacketSize]); - A02 = ei_ploadu(&lhs2[j]); A12 = ei_ploadu(&lhs2[j+PacketSize]); - A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]); + // NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient + // than the following code + asm("#mybegin"); + Packet A00, A01, A02, A03, A10, A11, A12, A13; + for (int j = alignedStart; j1) + { + /* explicit vectorization */ + // process first unaligned result's coeffs + for (int j=0; j1); asm("#end matrix_vector_product"); #undef _EIGEN_ACCUMULATE_PACKETS } @@ -562,78 +581,92 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type that is mandatory anyway. const int alignedStart = ei_alignmentOffset(rhs, size); - const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask); + const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0; - const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask; + const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; int alignmentPattern = alignmentStep==0 ? AllAligned : alignmentStep==2 ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || PacketSize==1 || size1) { - // nothing can be aligned, no need to skip any column - alignmentPattern = NoneAligned; - skipRows = 0; + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size1) { - Scalar b = rhs[j]; - tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j]; - } - - if (alignedSize>alignedStart) - { - switch(alignmentPattern) + /* explicit vectorization */ + Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); + + // process initial unaligned coeffs + // FIXME this loop get vectorized by the compiler ! + for (int j=0; jalignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (int j = alignedStart; j1); asm("#end matrix_vector_product"); #undef _EIGEN_ACCUMULATE_PACKETS diff --git a/Eigen/src/Core/Cwise.h b/Eigen/src/Core/Cwise.h index adfcfb2e8..d89800faa 100644 --- a/Eigen/src/Core/Cwise.h +++ b/Eigen/src/Core/Cwise.h @@ -26,15 +26,6 @@ #ifndef EIGEN_CWISE_H #define EIGEN_CWISE_H -/** \internal - * \array_module - * - * \brief Template functor to add a scalar to a fixed other one - * - * \sa class CwiseUnaryOp, Array::operator+ - */ -template::size)>1?true:false) > struct ei_scalar_add_op; - /** \internal * convenient macro to defined the return type of a cwise binary operation */ #define EIGEN_CWISE_BINOP_RETURN_TYPE(OP) \ diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index cfbc7affb..e868f2eee 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -247,21 +247,21 @@ struct ei_functor_traits > * * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/ */ -template::size)>1) > struct ei_scalar_multiple_op; - +/* NOTE why doing the ei_pset1() *is* an optimization ? + * indeed it seems better to declare m_other as a PacketScalar and do the ei_pset1() once + * in the constructor. However, in practice: + * - GCC does not like m_other as a PacketScalar and generate a load every time it needs it + * - one the other hand GCC is able to moves the ei_pset1() away the loop :) + * - simpler code ;) + * (ICC performs well in both cases) + */ template -struct ei_scalar_multiple_op { +struct ei_scalar_multiple_op { typedef typename ei_packet_traits::type PacketScalar; - inline ei_scalar_multiple_op(const Scalar& other) : m_other(ei_pset1(other)) { } - inline Scalar operator() (const Scalar& a) const { return a * ei_pfirst(m_other); } - inline const PacketScalar packetOp(const PacketScalar& a) const - { return ei_pmul(a, m_other); } - const PacketScalar m_other; -}; -template -struct ei_scalar_multiple_op { inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a * m_other; } + inline const PacketScalar packetOp(const PacketScalar& a) const + { return ei_pmul(a, ei_pset1(m_other)); } const Scalar m_other; }; template @@ -270,13 +270,16 @@ struct ei_functor_traits > template struct ei_scalar_quotient1_impl { + typedef typename ei_packet_traits::type PacketScalar; inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast(1) / other) {} inline Scalar operator() (const Scalar& a) const { return a * m_other; } + inline const PacketScalar packetOp(const PacketScalar& a) const + { return ei_pmul(a, ei_pset1(m_other)); } const Scalar m_other; }; template struct ei_functor_traits > -{ enum { Cost = NumTraits::MulCost, PacketAccess = false }; }; +{ enum { Cost = NumTraits::MulCost, PacketAccess = ei_packet_traits::size>1 }; }; template struct ei_scalar_quotient1_impl { @@ -305,22 +308,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl::size)>1) > struct ei_scalar_constant_op; - template -struct ei_scalar_constant_op { +struct ei_scalar_constant_op { typedef typename ei_packet_traits::type PacketScalar; - inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { } - inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); } - inline const PacketScalar packetOp() const - { return m_other; } - const PacketScalar m_other; -}; -template -struct ei_scalar_constant_op { inline ei_scalar_constant_op(const ei_scalar_constant_op& other) : m_other(other.m_other) { } inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { } inline const Scalar operator() (int, int = 0) const { return m_other; } + inline const PacketScalar packetOp() const { return ei_pset1(m_other); } const Scalar m_other; }; template diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 1fba262e2..76a2f60cf 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -81,11 +81,12 @@ template struct ei_scalar_inverse_op; template struct ei_scalar_square_op; template struct ei_scalar_cube_op; template struct ei_scalar_cast_op; -template struct ei_scalar_multiple_op; +template struct ei_scalar_multiple_op; template struct ei_scalar_quotient1_op; template struct ei_scalar_min_op; template struct ei_scalar_max_op; template struct ei_scalar_random_op; +template struct ei_scalar_add_op; template void ei_cache_friendly_product(