give up on OpenMP... for now

2025-07-09 22:51:51 +08:00 · 2008-04-18 07:57:46 +00:00 · 2008-04-18 07:57:46 +00:00 · 6ae037dfb5
commit 6ae037dfb5
parent acfd6f3bda
6 changed files with 73 additions and 134 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -10,13 +10,6 @@
 #endif
 #endif
 #ifndef EIGEN_DONT_PARALLELIZE
 #ifdef _OPENMP
 #define EIGEN_USE_OPENMP
 #include <omp.h>
 #endif
 #endif
 #include <cstdlib>
 #include <cmath>
 #include <complex>
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -135,11 +135,6 @@ Derived& MatrixBase<Derived>
  }
 }
 template<typename T1, typename T2> bool ei_should_parallelize_assignment(const T1& t, const T2&)
 {
  return (T1::Flags & T2::Flags & LargeBit) && t.size() >= EIGEN_PARALLELIZATION_TRESHOLD;
 }
 template <typename Derived, typename OtherDerived>
 struct ei_assignment_impl<Derived, OtherDerived, false>
 {
@@ -158,23 +153,17 @@ struct ei_assignment_impl<Derived, OtherDerived, false>
    {
      if(Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic)
      {
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
+        for(int j = 0; j < dst.cols(); j++)
-            for(int j = 0; j < dst.cols(); j++) \
+          for(int i = 0; i < dst.rows(); i++)
-              for(int i = 0; i < dst.rows(); i++) \
+            dst.coeffRef(i, j) = src.coeff(i, j);
                dst.coeffRef(i, j) = src.coeff(i, j);
        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
      else
      {
        // traverse in row-major order
        // in order to allow the compiler to unroll the inner loop
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
+        for(int i = 0; i < dst.rows(); i++)
-          for(int i = 0; i < dst.rows(); i++) \
+          for(int j = 0; j < dst.cols(); j++)
-            for(int j = 0; j < dst.cols(); j++) \
+            dst.coeffRef(i, j) = src.coeff(i, j);
              dst.coeffRef(i, j) = src.coeff(i, j);
        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
    }
  }
@@ -199,21 +188,15 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
    {
      if(OtherDerived::Flags&RowMajorBit)
      {
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
+        for(int i = 0; i < dst.rows(); i++)
-        for(int i = 0; i < dst.rows(); i++) \
+          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) \
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
      else
      {
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
+        for(int j = 0; j < dst.cols(); j++)
-        for(int j = 0; j < dst.cols(); j++) \
+          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) \
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
    }
  }
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -280,75 +280,67 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
 {
  res.setZero();
  const int cols4 = m_lhs.cols() & 0xfffffffC;
  const bool should_parallelize = (Flags & DestDerived::Flags & LargeBit)
                                && res.size() >= EIGEN_PARALLELIZATION_TRESHOLD;
  #ifdef EIGEN_VECTORIZE
  if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) )
-  {
+  {    
-    #define EIGEN_THE_PARALLELIZABLE_LOOP \
+    for(int k=0; k<this->cols(); k++)
-      for(int k=0; k<this->cols(); k++) \
+    {
-      { \
+      int j=0;
-        int j=0; \
+      for(; j<cols4; j+=4)
-        for(; j<cols4; j+=4) \
+      {
-        { \
+        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
-          const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); \
+        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
-          const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); \
+        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
-          const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); \
+        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
-          const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); \
+        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-          for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) \
+        {
-          { \
+          res.writePacketCoeff(i,k,\
-            res.writePacketCoeff(i,k,\
+            ei_padd(
-              ei_padd( \
+              res.packetCoeff(i,k),
-                res.packetCoeff(i,k), \
+              ei_padd(
-                ei_padd( \
+                ei_padd(
-                  ei_padd( \
+                  ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
-                    ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), \
+                  ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
-                    ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), \
+                ei_padd(
-                  ei_padd( \
+                  ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
-                    ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), \
+                  ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
-                    ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) \
+                )
-                  ) \
+              )
-                ) \
+            )
-              ) \
+          );
-            ); \
+        }
          } \
        } \
        for(; j<m_lhs.cols(); ++j) \
        { \
          const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); \
          for (int i=0; i<this->rows(); ++i) \
            res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); \
        } \
      }
-    EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize)
+      for(; j<m_lhs.cols(); ++j)
-    #undef EIGEN_THE_PARALLELIZABLE_LOOP
+      {
        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
        for (int i=0; i<this->rows(); ++i)
          res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
      }
    }
  }
  else
  #endif // EIGEN_VECTORIZE
  {
-    #define EIGEN_THE_PARALLELIZABLE_LOOP \
+    for(int k=0; k<this->cols(); ++k)
-      for(int k=0; k<this->cols(); ++k) \
+    {
-      { \
+      int j=0;
-        int j=0; \
+      for(; j<cols4; j+=4)
-        for(; j<cols4; j+=4) \
+      {
-        { \
+        const Scalar tmp0 = m_rhs.coeff(j  ,k);
-          const Scalar tmp0 = m_rhs.coeff(j  ,k); \
+        const Scalar tmp1 = m_rhs.coeff(j+1,k);
-          const Scalar tmp1 = m_rhs.coeff(j+1,k); \
+        const Scalar tmp2 = m_rhs.coeff(j+2,k);
-          const Scalar tmp2 = m_rhs.coeff(j+2,k); \
+        const Scalar tmp3 = m_rhs.coeff(j+3,k);
-          const Scalar tmp3 = m_rhs.coeff(j+3,k); \
+        for (int i=0; i<this->rows(); ++i)
-          for (int i=0; i<this->rows(); ++i) \
+          res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1)
-            res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1) \
+                            + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
                              + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3); \
        } \
        for(; j<m_lhs.cols(); ++j) \
        { \
          const Scalar tmp = m_rhs.coeff(j,k); \
          for (int i=0; i<this->rows(); ++i) \
            res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j); \
        } \
      }
-    EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize)
+      for(; j<m_lhs.cols(); ++j)
-    #undef EIGEN_THE_PARALLELIZABLE_LOOP
+      {
        const Scalar tmp = m_rhs.coeff(j,k);
        for (int i=0; i<this->rows(); ++i)
          res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j);
      }
    }
  }
 }
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -37,10 +37,6 @@
 #define EIGEN_UNROLLING_LIMIT 400
 #endif
 #ifndef EIGEN_PARALLELIZATION_TRESHOLD
 #define EIGEN_PARALLELIZATION_TRESHOLD 2000
 #endif
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER RowMajorBit
 #else
@@ -78,30 +74,6 @@ using Eigen::MatrixBase;
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
 #endif
 #ifdef EIGEN_USE_OPENMP
 # ifdef __INTEL_COMPILER
 #   define EIGEN_PRAGMA_OMP_PARALLEL _Pragma("omp parallel default(none) shared(other)")
 # else
 #   define EIGEN_PRAGMA_OMP_PARALLEL _Pragma("omp parallel default(none)")
 # endif
 # define EIGEN_RUN_PARALLELIZABLE_LOOP(condition) \
  if(condition) \
  { \
    EIGEN_PRAGMA_OMP_PARALLEL \
    { \
      _Pragma("omp for") \
      EIGEN_THE_PARALLELIZABLE_LOOP \
    } \
  } \
  else \
  { \
    EIGEN_THE_PARALLELIZABLE_LOOP \
  }
 #else // EIGEN_USE_OPENMP
 # define EIGEN_RUN_PARALLELIZABLE_LOOP(condition) EIGEN_THE_PARALLELIZABLE_LOOP
 #endif
 // FIXME with the always_inline attribute,
 // gcc 3.4.x reports the following compilation error:
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -92,7 +92,6 @@ template<typename MatrixType, bool CheckExistence> class Inverse : ei_no_assignm
    enum { _Size = MatrixType::RowsAtCompileTime };
    void _compute(const MatrixType& matrix);
    void _compute_in_general_case(const MatrixType& matrix);
    void _compute_in_size1_case(const MatrixType& matrix);
    void _compute_in_size2_case(const MatrixType& matrix);
    void _compute_in_size3_case(const MatrixType& matrix);
    void _compute_in_size4_case(const MatrixType& matrix);
--- a/bench/benchmarkXcwise.cpp
+++ b/bench/benchmarkXcwise.cpp
@@ -5,12 +5,12 @@
 using namespace std;
 USING_PART_OF_NAMESPACE_EIGEN
-#ifndef MATTYPE
+#ifndef VECTYPE
-#define MATTYPE MatrixXLd
+#define VECTYPE VectorXLd
 #endif
-#ifndef MATSIZE
+#ifndef VECSIZE
-#define MATSIZE 1000000
+#define VECSIZE 1000000
 #endif
 #ifndef REPEAT
@@ -19,16 +19,16 @@ USING_PART_OF_NAMESPACE_EIGEN
 int main(int argc, char *argv[])
 {
-	MATTYPE I = MATTYPE::ones(MATSIZE,1);
+	VECTYPE I = VECTYPE::ones(VECSIZE);
-	MATTYPE m(MATSIZE,1);
+	VECTYPE m(VECSIZE,1);
-	for(int i = 0; i < MATSIZE; i++) for(int j = 0; j < 1; j++)
+	for(int i = 0; i < VECSIZE; i++)
 	{
-		m(i,j) = 0.1 * (i+j+1)/MATSIZE/MATSIZE;
+		m[i] = 0.1 * i/VECSIZE;
 	}
 	for(int a = 0; a < REPEAT; a++)
 	{
-		m = MATTYPE::ones(MATSIZE,1) + 0.00005 * (m.cwiseProduct(m) + m/4);
+		m = VECTYPE::ones(VECSIZE) + 0.00005 * (m.cwiseProduct(m) + m/4);
 	}
-	cout << m(0,0) << endl;
+	cout << m[0] << endl;
 	return 0;
 }