removed the packet specializations of some functors

(GCC generates better code without those "optimizations")
Gael Guennebaud 2008-07-31 21:03:11 +00:00
parent 842c4f8bfa
commit b32b186c14
5 changed files with 189 additions and 166 deletions
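To make the change concrete, here is a minimal, self-contained sketch of the two functor designs this commit chooses between. It is illustrative only: plain SSE intrinsics stand in for Eigen's ei_pset1/ei_padd wrappers, and the struct names are invented, not taken from the commit.

    #include <immintrin.h>

    typedef __m128 Packet;  // 4 floats per SSE register

    // Removed design: broadcast once in the constructor and store the packet.
    // Per the NOTE added in Functors.h, GCC (circa 2008) tended to reload the
    // packet member from memory on every use inside the loop.
    struct add_op_packet_member {
      explicit add_op_packet_member(float other) : m_other(_mm_set1_ps(other)) {}
      Packet packetOp(Packet a) const { return _mm_add_ps(a, m_other); }
      Packet m_other;
    };

    // Kept design: store the scalar and broadcast inside packetOp().
    // GCC hoists the broadcast out of the loop by itself, so nothing is lost.
    struct add_op_scalar_member {
      explicit add_op_scalar_member(float other) : m_other(other) {}
      Packet packetOp(Packet a) const { return _mm_add_ps(a, _mm_set1_ps(m_other)); }
      float m_other;
    };

The NOTE added to Functors.h further down spells out the same rationale; ICC reportedly performed well with either form.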


@@ -25,19 +25,21 @@
 #ifndef EIGEN_ARRAY_FUNCTORS_H
 #define EIGEN_ARRAY_FUNCTORS_H
 
+/** \internal
+  * \array_module
+  *
+  * \brief Template functor to add a scalar to a fixed other one
+  *
+  * \sa class CwiseUnaryOp, Array::operator+
+  */
+/* If you wonder why doing the ei_pset1() in packetOp() is an optimization, check ei_scalar_multiple_op */
 template<typename Scalar>
-struct ei_scalar_add_op<Scalar,true> {
+struct ei_scalar_add_op {
   typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-  inline ei_scalar_add_op(const Scalar& other) : m_other(ei_pset1(other)) { }
-  inline Scalar operator() (const Scalar& a) const { return a + ei_pfirst(m_other); }
-  inline const PacketScalar packetOp(const PacketScalar& a) const
-  { return ei_padd(a, m_other); }
-  const PacketScalar m_other;
-};
-template<typename Scalar>
-struct ei_scalar_add_op<Scalar,false> {
   inline ei_scalar_add_op(const Scalar& other) : m_other(other) { }
   inline Scalar operator() (const Scalar& a) const { return a + m_other; }
+  inline const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_padd(a, ei_pset1(m_other)); }
   const Scalar m_other;
 };
 template<typename Scalar>


@@ -181,6 +181,8 @@ static void ei_cache_friendly_product(
     if (PacketSize>1 && size_t(rhsColumn)%16)
     {
       int count = 0;
+      // FIXME this loop gets vectorized by the compiler (ICC);
+      // I'm not sure whether that's good or not
       for (int k = l2k; k<l2blockSizeEnd; ++k)
       {
         rhsCopy[count++] = rhsColumn[k];
@@ -266,6 +268,7 @@ static void ei_cache_friendly_product(
     if (PacketSize>1 && size_t(rhsColumn)%16)
     {
       int count = 0;
+      // FIXME this loop gets vectorized by the compiler!
       for (int k = l2k; k<l2blockSizeEnd; ++k)
       {
         rhsCopy[count++] = rhsColumn[k];
@@ -335,6 +338,7 @@ static void ei_cache_friendly_product(
     for (int i=0; i<rows; ++i)
     {
       Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
+      // FIXME this loop gets vectorized by the compiler!
       for (int k=1; k<remainingSize; ++k)
         tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
      res[i+j*resStride] += tmp;
@@ -397,20 +401,23 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
   const int alignedStart = ei_alignmentOffset(res,size);
-  const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
+  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
   const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
-  const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
+  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
                        : alignmentStep==2 ? EvenAligned
                        : FirstAligned;
   // we cannot assume the first element is aligned because of sub-matrices
   const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
-  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize || PacketSize==1);
   // find how many columns do we have to skip to be aligned with the result (if possible)
-  int skipColumns=0;
+  int skipColumns = 0;
+  if (PacketSize>1)
+  {
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
     for (; skipColumns<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipColumns; ++skipColumns)
     {}
     if (skipColumns==PacketSize)
@@ -425,17 +432,23 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
       // note that the skipped columns are processed later.
     }
-  ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
-                     || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
+    ei_internal_assert((alignmentPattern==NoneAligned) || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
+  }
   int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
     Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+1]),
            ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]);
+    // this helps a lot in generating better binary code
     const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride,
                  *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
+    if (PacketSize>1)
+    {
+      /* explicit vectorization */
       // process initial unaligned coeffs
       for (int j=0; j<alignedStart; j++)
         res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
@@ -485,8 +498,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
           break;
       }
     }
-    // process remaining coeffs
+    } // end explicit vectorization
+    /* process remaining coeffs (or all if there is no explicit vectorization) */
     for (int j=alignedSize; j<size; j++)
       res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
   }
@@ -500,6 +514,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   {
     Packet ptmp0 = ei_pset1(rhs[i]);
     const Scalar* lhs0 = lhs + i*lhsStride;
+    if (PacketSize>1)
+    {
+      /* explicit vectorization */
       // process first unaligned result's coeffs
       for (int j=0; j<alignedStart; j++)
         res[j] += ei_pfirst(ptmp0) * lhs0[j];
@@ -511,8 +529,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
       else
        for (int j = alignedStart;j<alignedSize;j+=PacketSize)
          ei_pstore(&res[j], ei_pmadd(ptmp0,ei_ploadu(&lhs0[j]),ei_pload(&res[j])));
-    // process remaining scalars
+    }
+    // process remaining scalars (or all if no explicit vectorization)
     for (int j=alignedSize; j<size; j++)
       res[j] += ei_pfirst(ptmp0) * lhs0[j];
   }
@@ -524,7 +543,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     }
     else
       break;
-  } while(true);
+  } while(PacketSize>1);
   asm("#end matrix_vector_product");
   #undef _EIGEN_ACCUMULATE_PACKETS
 }
@@ -562,19 +581,23 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
   const int alignedStart = ei_alignmentOffset(rhs, size);
-  const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
+  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
   //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
-  const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
+  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
                        : alignmentStep==2 ? EvenAligned
                        : FirstAligned;
   // we cannot assume the first element is aligned because of sub-matrices
   const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
-  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || PacketSize==1 || size<PacketSize);
   // find how many rows do we have to skip to be aligned with rhs (if possible)
-  int skipRows=0;
+  int skipRows = 0;
+  if (PacketSize>1)
+  {
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
     for (; skipRows<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipRows; ++skipRows)
     {}
     if (skipRows==PacketSize)
@@ -590,16 +613,24 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     }
     ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
                        || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
+  }
   int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
     Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
-    Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
+    // this helps the compiler generate good binary code
     const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride,
                  *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
+    if (PacketSize>1)
+    {
+      /* explicit vectorization */
+      Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
       // process initial unaligned coeffs
+      // FIXME this loop gets vectorized by the compiler!
       for (int j=0; j<alignedStart; j++)
       {
         Scalar b = rhs[j];
@@ -632,8 +663,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
       tmp2 += ei_predux(ptmp2);
       tmp3 += ei_predux(ptmp3);
     }
-    // process remaining coeffs
+    } // end explicit vectorization
+    // process remaining coeffs (or all if no explicit vectorization)
+    // FIXME this loop gets vectorized by the compiler!
     for (int j=alignedSize; j<size; j++)
     {
       Scalar b = rhs[j];
@@ -653,6 +686,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     Packet ptmp0 = ei_pset1(tmp0);
     const Scalar* lhs0 = lhs + i*lhsStride;
     // process first unaligned result's coeffs
+    // FIXME this loop gets vectorized by the compiler!
     for (int j=0; j<alignedStart; j++)
       tmp0 += rhs[j] * lhs0[j];
@@ -669,6 +703,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     }
     // process remaining scalars
+    // FIXME this loop gets vectorized by the compiler!
     for (int j=alignedSize; j<size; j++)
       tmp0 += rhs[j] * lhs0[j];
     res[i] += tmp0;
@@ -681,7 +716,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     }
     else
       break;
-  } while(true);
+  } while(PacketSize>1);
   asm("#end matrix_vector_product");
   #undef _EIGEN_ACCUMULATE_PACKETS
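The recurring if (PacketSize>1) guards and the } while(PacketSize>1); loop conditions introduced above let the very same function body collapse to pure scalar code when the scalar type has no packet support. A minimal sketch of the pattern, with invented names (this is not the Eigen code): when PacketSize is a compile-time 1, alignedSize is 0, the "explicit vectorization" block is dead code the compiler removes, and the tail loop covers the whole range.

    // Illustrative guard pattern; PacketSize is a compile-time constant.
    template <int PacketSize>
    void scale_accumulate(float* res, const float* lhs, float coeff, int size)
    {
      const int alignedSize = PacketSize > 1 ? (size / PacketSize) * PacketSize : 0;
      if (PacketSize > 1)
      {
        /* explicit vectorization of [0, alignedSize) would go here */
      }
      // process remaining scalars (or all of them when PacketSize == 1)
      for (int j = alignedSize; j < size; ++j)
        res[j] += coeff * lhs[j];
    }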


@@ -26,15 +26,6 @@
 #ifndef EIGEN_CWISE_H
 #define EIGEN_CWISE_H
 
-/** \internal
-  * \array_module
-  *
-  * \brief Template functor to add a scalar to a fixed other one
-  *
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_add_op;
-
 /** \internal
   * convenient macro to define the return type of a cwise binary operation */
 #define EIGEN_CWISE_BINOP_RETURN_TYPE(OP) \


@@ -247,21 +247,21 @@ struct ei_functor_traits<ei_scalar_real_op<Scalar> >
   *
   * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
   */
-template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_multiple_op;
+/* NOTE: why is doing the ei_pset1() in packetOp() an optimization?
+ * Indeed, it seems better to declare m_other as a PacketScalar and do the ei_pset1() once
+ * in the constructor. However, in practice:
+ *  - GCC does not like m_other as a PacketScalar and generates a load every time it needs it
+ *  - on the other hand, GCC is able to move the ei_pset1() out of the loop :)
+ *  - simpler code ;)
+ * (ICC performs well in both cases)
+ */
 template<typename Scalar>
-struct ei_scalar_multiple_op<Scalar,true> {
+struct ei_scalar_multiple_op {
   typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-  inline ei_scalar_multiple_op(const Scalar& other) : m_other(ei_pset1(other)) { }
-  inline Scalar operator() (const Scalar& a) const { return a * ei_pfirst(m_other); }
-  inline const PacketScalar packetOp(const PacketScalar& a) const
-  { return ei_pmul(a, m_other); }
-  const PacketScalar m_other;
-};
-template<typename Scalar>
-struct ei_scalar_multiple_op<Scalar,false> {
   inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
   inline Scalar operator() (const Scalar& a) const { return a * m_other; }
+  inline const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pmul(a, ei_pset1(m_other)); }
   const Scalar m_other;
 };
 template<typename Scalar>
@@ -270,13 +270,16 @@ struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
 template<typename Scalar, bool HasFloatingPoint>
 struct ei_scalar_quotient1_impl {
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
   inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {}
   inline Scalar operator() (const Scalar& a) const { return a * m_other; }
+  inline const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pmul(a, ei_pset1(m_other)); }
   const Scalar m_other;
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,true> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; };
 template<typename Scalar>
 struct ei_scalar_quotient1_impl<Scalar,false> {
@@ -305,22 +308,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala
 // nullary functors
-template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_constant_op;
 template<typename Scalar>
-struct ei_scalar_constant_op<Scalar,true> {
+struct ei_scalar_constant_op {
   typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-  inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { }
-  inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); }
-  inline const PacketScalar packetOp() const
-  { return m_other; }
-  const PacketScalar m_other;
-};
-template<typename Scalar>
-struct ei_scalar_constant_op<Scalar,false> {
   inline ei_scalar_constant_op(const ei_scalar_constant_op& other) : m_other(other.m_other) { }
   inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
   inline const Scalar operator() (int, int = 0) const { return m_other; }
+  inline const PacketScalar packetOp() const { return ei_pset1(m_other); }
   const Scalar m_other;
 };
 template<typename Scalar>
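For reference, a unified functor like the new ei_scalar_multiple_op is typically consumed by a traversal that calls packetOp() for the aligned bulk and operator() for the scalar tail. Below is a toy consumer under the same assumptions as the sketch after the commit header (SSE __m128 packets, invented function name); it is not Eigen's evaluator, only an illustration of why each functor carries both interfaces.

    #include <immintrin.h>

    // f must provide float operator()(float) and __m128 packetOp(__m128),
    // mirroring the dual interface of the unified functors in this commit.
    template <typename Functor>
    void apply(float* dst, const float* src, int size, Functor f)
    {
      const int PacketSize = 4;
      const int alignedSize = (size / PacketSize) * PacketSize;
      for (int i = 0; i < alignedSize; i += PacketSize)   // packet path
        _mm_storeu_ps(dst + i, f.packetOp(_mm_loadu_ps(src + i)));
      for (int i = alignedSize; i < size; ++i)            // scalar tail
        dst[i] = f(src[i]);
    }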


@@ -81,11 +81,12 @@ template<typename Scalar> struct ei_scalar_inverse_op;
 template<typename Scalar> struct ei_scalar_square_op;
 template<typename Scalar> struct ei_scalar_cube_op;
 template<typename Scalar, typename NewType> struct ei_scalar_cast_op;
-template<typename Scalar, bool PacketAccess> struct ei_scalar_multiple_op;
+template<typename Scalar> struct ei_scalar_multiple_op;
 template<typename Scalar> struct ei_scalar_quotient1_op;
 template<typename Scalar> struct ei_scalar_min_op;
 template<typename Scalar> struct ei_scalar_max_op;
 template<typename Scalar> struct ei_scalar_random_op;
+template<typename Scalar> struct ei_scalar_add_op;
 
 template<typename Scalar>
 void ei_cache_friendly_product(