split sum away from redux and vectorize it.

(could come back to redux after it has been vectorized, and could serve as a starting point for that) also make the abs2 functor vectorizable (for real types).
2025-09-17 20:03:17 +08:00 · 2008-06-23 10:32:48 +00:00 · 2008-06-23 10:32:48 +00:00 · dc9206cec5
commit dc9206cec5
parent 8a967fb17c
6 changed files with 301 additions and 39 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@ -48,6 +48,7 @@ namespace Eigen {
 #include "src/Core/Dot.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/DiagonalCoeffs.h"
+#include "src/Core/Sum.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@ -31,19 +31,6 @@
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/

-enum {
-  NoVectorization,
-  InnerVectorization,
-  LinearVectorization,
-  SliceVectorization
-};
-
-enum {
-  CompleteUnrolling,
-  InnerUnrolling,
-  NoUnrolling
-};
-
 template <typename Derived, typename OtherDerived>
 struct ei_assign_traits
 {
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@ -173,10 +173,13 @@ struct ei_functor_traits<ei_scalar_abs_op<Scalar> >
 template<typename Scalar> struct ei_scalar_abs2_op EIGEN_EMPTY_STRUCT {
  typedef typename NumTraits<Scalar>::Real result_type;
  inline const result_type operator() (const Scalar& a) const { return ei_abs2(a); }
+  template<typename PacketScalar>
+  inline const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pmul(a,a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_abs2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = NumTraits<Scalar>::IsComplex==false && int(ei_packet_traits<Scalar>::size)>1 }; };

 /** \internal
  * \brief Template functor to compute the conjugate of a complex value
@ -223,7 +226,7 @@ struct ei_functor_traits<ei_scalar_real_op<Scalar> >
  *
  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
  */
-template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_multiple_op;
+template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_multiple_op;

 template<typename Scalar>
 struct ei_scalar_multiple_op<Scalar,true> {
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@ -96,30 +96,6 @@ MatrixBase<Derived>::redux(const BinaryOp& func) const
          ::run(derived(), func);
 }

-/** \returns the sum of all coefficients of *this
-  *
-  * \sa trace()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::sum() const
-{
-  return this->redux(Eigen::ei_scalar_sum_op<Scalar>());
-}
-
-/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
-  *
-  * \c *this can be any matrix, not necessarily square.
-  *
-  * \sa diagonal(), sum()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::trace() const
-{
-  return diagonal().sum();
-}
-
 /** \returns the minimum of all coefficients of *this
  */
 template<typename Derived>
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@ -0,0 +1,282 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob@math.jussieu.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SUM_H
+#define EIGEN_SUM_H
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for vectorization and unrolling
+***************************************************************************/
+
+template<typename Derived>
+struct ei_sum_traits
+{
+public:
+  enum {
+    Vectorization = (int(Derived::Flags)&PacketAccessBit)
+                 && (int(Derived::Flags)&LinearAccessBit)
+                  ? LinearVectorization
+                  : NoVectorization
+  };
+
+private:
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
+    Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
+           + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
+    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
+  };
+
+public:
+  enum {
+    Unrolling = Cost <= UnrollingLimit
+              ? CompleteUnrolling
+              : NoUnrolling
+  };       
+};
+
+/***************************************************************************
+* Part 2 : unrollers
+***************************************************************************/
+
+/*** no vectorization ***/
+ 
+template<typename Derived, int Start, int Length>
+struct ei_sum_novec_unroller
+{
+  enum {
+    HalfLength = Length/2
+  };
+
+  typedef typename Derived::Scalar Scalar;
+
+  inline static Scalar run(const Derived &mat)
+  {
+    return ei_sum_novec_unroller<Derived, Start, HalfLength>::run(mat)
+         + ei_sum_novec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat);
+  }
+};
+
+template<typename Derived, int Start>
+struct ei_sum_novec_unroller<Derived, Start, 1>
+{
+  enum {
+    col = Start / Derived::RowsAtCompileTime,
+    row = Start % Derived::RowsAtCompileTime
+  };
+
+  typedef typename Derived::Scalar Scalar;
+
+  inline static Scalar run(const Derived &mat)
+  {
+    return mat.coeff(row, col);
+  }
+};
+
+/*** vectorization ***/
+
+template<typename Derived, int Index, int Stop,
+         bool LastPacket = (Stop-Index == ei_packet_traits<typename Derived::Scalar>::size)>
+struct ei_sum_vec_unroller
+{
+  enum {
+    row = int(Derived::Flags)&RowMajorBit
+        ? Index / int(Derived::ColsAtCompileTime)
+        : Index % Derived::RowsAtCompileTime,
+    col = int(Derived::Flags)&RowMajorBit
+        ? Index % int(Derived::ColsAtCompileTime)
+        : Index / Derived::RowsAtCompileTime
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  inline static PacketScalar run(const Derived &mat)
+  {
+    return ei_padd(
+      mat.template packet<Aligned>(row, col),
+      ei_sum_vec_unroller<Derived, Index+ei_packet_traits<typename Derived::Scalar>::size, Stop>::run(mat)
+    );
+  }
+};
+
+template<typename Derived, int Index, int Stop>
+struct ei_sum_vec_unroller<Derived, Index, Stop, true>
+{
+  enum {
+    row = int(Derived::Flags)&RowMajorBit
+        ? Index / int(Derived::ColsAtCompileTime)
+        : Index % Derived::RowsAtCompileTime,
+    col = int(Derived::Flags)&RowMajorBit
+        ? Index % int(Derived::ColsAtCompileTime)
+        : Index / Derived::RowsAtCompileTime
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  inline static PacketScalar run(const Derived &mat)
+  {
+    return mat.template packet<Aligned>(row, col);
+  }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Derived,
+         int Vectorization = ei_sum_traits<Derived>::Vectorization,
+         int Unrolling = ei_sum_traits<Derived>::Unrolling
+>
+struct ei_sum_impl;
+
+template<typename Derived>
+struct ei_sum_impl<Derived, NoVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  static Scalar run(const Derived& mat)
+  {
+    Scalar res;
+    res = mat.coeff(0, 0);
+    for(int i = 1; i < mat.rows(); i++)
+      res += mat.coeff(i, 0);
+    for(int j = 1; j < mat.cols(); j++)
+      for(int i = 0; i < mat.rows(); i++)
+        res += mat.coeff(i, j);
+    return res;
+  }
+};
+
+template<typename Derived>
+struct ei_sum_impl<Derived, NoVectorization, CompleteUnrolling>
+  : public ei_sum_novec_unroller<Derived, 0, Derived::SizeAtCompileTime>
+{};
+
+template<typename Derived>
+struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  static Scalar run(const Derived& mat)
+  {
+    const int size = mat.size();
+    const int packetSize = ei_packet_traits<typename Derived::Scalar>::size;
+    const int alignedSize = (size/packetSize)*packetSize;
+    const bool rowMajor = Derived::Flags&RowMajorBit;
+    const int innerSize = rowMajor ? mat.cols() : mat.rows();
+    const int outerSize = rowMajor ? mat.rows() : mat.cols();
+    Scalar res;
+
+    // do the vectorizable part of the sum
+    if(size >= packetSize)
+    {
+      PacketScalar packet_res;
+      packet_res = mat.template packet<Aligned>(0, 0);
+      int index;
+      for(index = packetSize; index<alignedSize ; index+=packetSize)
+      {
+        // FIXME the following is not really efficient
+        const int row = rowMajor ? index/innerSize : index%innerSize;
+        const int col = rowMajor ? index%innerSize : index/innerSize;
+        packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
+      }
+      res = ei_predux(packet_res);
+
+      // now we must do the rest without vectorization.
+      if(alignedSize == size) return res;
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway, don't try to optimize.
+    {
+      res = Scalar(0);
+    }
+
+    const int k = alignedSize/innerSize;
+
+    // do the remainder of the current row or col
+    for(int i = alignedSize%innerSize; i < innerSize; i++)
+    {
+      const int row = rowMajor ? k : i;
+      const int col = rowMajor ? i : k;
+      res += mat.coeff(row, col);
+    }
+
+    // do the remaining rows or cols
+    for(int j = k+1; j < outerSize; j++)
+      for(int i = 0; i < innerSize; i++)
+      {
+        const int row = rowMajor ? i : j;
+        const int col = rowMajor ? j : i;
+        res += mat.coeff(row, col);
+      }
+
+    return res;
+  }
+};
+
+template<typename Derived>
+struct ei_sum_impl<Derived, LinearVectorization, CompleteUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  static Scalar run(const Derived& mat)
+  {
+    return ei_predux(
+      ei_sum_vec_unroller<Derived, 0, Derived::SizeAtCompileTime>::run(mat)
+    );
+  }
+};
+
+/***************************************************************************
+* Part 4 : implementation of MatrixBase methods
+***************************************************************************/
+
+/** \returns the sum of all coefficients of *this
+  *
+  * \sa trace()
+  */
+template<typename Derived>
+inline typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::sum() const
+{
+  return ei_sum_impl<Derived>::run(derived());
+}
+
+/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
+  *
+  * \c *this can be any matrix, not necessarily square.
+  *
+  * \sa diagonal(), sum()
+  */
+template<typename Derived>
+inline typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::trace() const
+{
+  return diagonal().sum();
+}
+
+
+#endif // EIGEN_SUM_H
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@ -142,5 +142,18 @@ enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
 enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct };

+enum {
+  NoVectorization,
+  InnerVectorization,
+  LinearVectorization,
+  SliceVectorization
+};
+
+enum {
+  CompleteUnrolling,
+  InnerUnrolling,
+  NoUnrolling
+};
+

 #endif // EIGEN_CONSTANTS_H