give up on OpenMP... for now

2025-09-18 12:23:13 +08:00 · 2008-04-18 07:57:46 +00:00 · 2008-04-18 07:57:46 +00:00 · 6ae037dfb5
commit 6ae037dfb5
parent acfd6f3bda
6 changed files with 73 additions and 134 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -10,13 +10,6 @@
 #endif
 #endif

-#ifndef EIGEN_DONT_PARALLELIZE
-#ifdef _OPENMP
-#define EIGEN_USE_OPENMP
-#include <omp.h>
-#endif
-#endif
-
 #include <cstdlib>
 #include <cmath>
 #include <complex>
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -135,11 +135,6 @@ Derived& MatrixBase<Derived>
  }
 }

-template<typename T1, typename T2> bool ei_should_parallelize_assignment(const T1& t, const T2&)
-{
-  return (T1::Flags & T2::Flags & LargeBit) && t.size() >= EIGEN_PARALLELIZATION_TRESHOLD;
-}
-
 template <typename Derived, typename OtherDerived>
 struct ei_assignment_impl<Derived, OtherDerived, false>
 {
@@ -158,23 +153,17 @@ struct ei_assignment_impl<Derived, OtherDerived, false>
    {
      if(Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic)
      {
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
-            for(int j = 0; j < dst.cols(); j++) \
-              for(int i = 0; i < dst.rows(); i++) \
-                dst.coeffRef(i, j) = src.coeff(i, j);
-        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
-        #undef EIGEN_THE_PARALLELIZABLE_LOOP
+        for(int j = 0; j < dst.cols(); j++)
+          for(int i = 0; i < dst.rows(); i++)
+            dst.coeffRef(i, j) = src.coeff(i, j);
      }
      else
      {
        // traverse in row-major order
        // in order to allow the compiler to unroll the inner loop
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
-          for(int i = 0; i < dst.rows(); i++) \
-            for(int j = 0; j < dst.cols(); j++) \
-              dst.coeffRef(i, j) = src.coeff(i, j);
-        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
-        #undef EIGEN_THE_PARALLELIZABLE_LOOP
+        for(int i = 0; i < dst.rows(); i++)
+          for(int j = 0; j < dst.cols(); j++)
+            dst.coeffRef(i, j) = src.coeff(i, j);
      }
    }
  }
@@ -199,21 +188,15 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
    {
      if(OtherDerived::Flags&RowMajorBit)
      {
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
-        for(int i = 0; i < dst.rows(); i++) \
-          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) \
+        for(int i = 0; i < dst.rows(); i++)
+          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
-        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
-        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
      else
      {
-        #define EIGEN_THE_PARALLELIZABLE_LOOP \
-        for(int j = 0; j < dst.cols(); j++) \
-          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) \
+        for(int j = 0; j < dst.cols(); j++)
+          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
-        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
-        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
    }
  }
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -280,75 +280,67 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
 {
  res.setZero();
  const int cols4 = m_lhs.cols() & 0xfffffffC;
-  const bool should_parallelize = (Flags & DestDerived::Flags & LargeBit)
-                                && res.size() >= EIGEN_PARALLELIZATION_TRESHOLD;
  #ifdef EIGEN_VECTORIZE
  if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) )
-  {
-    #define EIGEN_THE_PARALLELIZABLE_LOOP \
-      for(int k=0; k<this->cols(); k++) \
-      { \
-        int j=0; \
-        for(; j<cols4; j+=4) \
-        { \
-          const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); \
-          const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); \
-          const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); \
-          const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); \
-          for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) \
-          { \
-            res.writePacketCoeff(i,k,\
-              ei_padd( \
-                res.packetCoeff(i,k), \
-                ei_padd( \
-                  ei_padd( \
-                    ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), \
-                    ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), \
-                  ei_padd( \
-                    ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), \
-                    ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) \
-                  ) \
-                ) \
-              ) \
-            ); \
-          } \
-        } \
-        for(; j<m_lhs.cols(); ++j) \
-        { \
-          const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); \
-          for (int i=0; i<this->rows(); ++i) \
-            res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); \
-        } \
+  {    
+    for(int k=0; k<this->cols(); k++)
+    {
+      int j=0;
+      for(; j<cols4; j+=4)
+      {
+        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
+        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
+        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
+        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
+        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+        {
+          res.writePacketCoeff(i,k,\
+            ei_padd(
+              res.packetCoeff(i,k),
+              ei_padd(
+                ei_padd(
+                  ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
+                  ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+                ei_padd(
+                  ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
+                  ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
+                )
+              )
+            )
+          );
+        }
      }
-    EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize)
-    #undef EIGEN_THE_PARALLELIZABLE_LOOP
+      for(; j<m_lhs.cols(); ++j)
+      {
+        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
+        for (int i=0; i<this->rows(); ++i)
+          res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
+      }
+    }
  }
  else
  #endif // EIGEN_VECTORIZE
  {
-    #define EIGEN_THE_PARALLELIZABLE_LOOP \
-      for(int k=0; k<this->cols(); ++k) \
-      { \
-        int j=0; \
-        for(; j<cols4; j+=4) \
-        { \
-          const Scalar tmp0 = m_rhs.coeff(j  ,k); \
-          const Scalar tmp1 = m_rhs.coeff(j+1,k); \
-          const Scalar tmp2 = m_rhs.coeff(j+2,k); \
-          const Scalar tmp3 = m_rhs.coeff(j+3,k); \
-          for (int i=0; i<this->rows(); ++i) \
-            res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1) \
-                              + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3); \
-        } \
-        for(; j<m_lhs.cols(); ++j) \
-        { \
-          const Scalar tmp = m_rhs.coeff(j,k); \
-          for (int i=0; i<this->rows(); ++i) \
-            res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j); \
-        } \
+    for(int k=0; k<this->cols(); ++k)
+    {
+      int j=0;
+      for(; j<cols4; j+=4)
+      {
+        const Scalar tmp0 = m_rhs.coeff(j  ,k);
+        const Scalar tmp1 = m_rhs.coeff(j+1,k);
+        const Scalar tmp2 = m_rhs.coeff(j+2,k);
+        const Scalar tmp3 = m_rhs.coeff(j+3,k);
+        for (int i=0; i<this->rows(); ++i)
+          res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1)
+                            + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
      }
-    EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize)
-    #undef EIGEN_THE_PARALLELIZABLE_LOOP
+      for(; j<m_lhs.cols(); ++j)
+      {
+        const Scalar tmp = m_rhs.coeff(j,k);
+        for (int i=0; i<this->rows(); ++i)
+          res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j);
+      }
+    }
  }
 }

--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -37,10 +37,6 @@
 #define EIGEN_UNROLLING_LIMIT 400
 #endif

-#ifndef EIGEN_PARALLELIZATION_TRESHOLD
-#define EIGEN_PARALLELIZATION_TRESHOLD 2000
-#endif
-
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER RowMajorBit
 #else
@@ -78,30 +74,6 @@ using Eigen::MatrixBase;
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
 #endif

-#ifdef EIGEN_USE_OPENMP
-# ifdef __INTEL_COMPILER
-#   define EIGEN_PRAGMA_OMP_PARALLEL _Pragma("omp parallel default(none) shared(other)")
-# else
-#   define EIGEN_PRAGMA_OMP_PARALLEL _Pragma("omp parallel default(none)")
-# endif
-# define EIGEN_RUN_PARALLELIZABLE_LOOP(condition) \
-  if(condition) \
-  { \
-    EIGEN_PRAGMA_OMP_PARALLEL \
-    { \
-      _Pragma("omp for") \
-      EIGEN_THE_PARALLELIZABLE_LOOP \
-    } \
-  } \
-  else \
-  { \
-    EIGEN_THE_PARALLELIZABLE_LOOP \
-  }
-#else // EIGEN_USE_OPENMP
-# define EIGEN_RUN_PARALLELIZABLE_LOOP(condition) EIGEN_THE_PARALLELIZABLE_LOOP
-#endif
-
-
 // FIXME with the always_inline attribute,
 // gcc 3.4.x reports the following compilation error:
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -92,7 +92,6 @@ template<typename MatrixType, bool CheckExistence> class Inverse : ei_no_assignm
    enum { _Size = MatrixType::RowsAtCompileTime };
    void _compute(const MatrixType& matrix);
    void _compute_in_general_case(const MatrixType& matrix);
-    void _compute_in_size1_case(const MatrixType& matrix);
    void _compute_in_size2_case(const MatrixType& matrix);
    void _compute_in_size3_case(const MatrixType& matrix);
    void _compute_in_size4_case(const MatrixType& matrix);
--- a/bench/benchmarkXcwise.cpp
+++ b/bench/benchmarkXcwise.cpp
@@ -5,12 +5,12 @@
 using namespace std;
 USING_PART_OF_NAMESPACE_EIGEN

-#ifndef MATTYPE
-#define MATTYPE MatrixXLd
+#ifndef VECTYPE
+#define VECTYPE VectorXLd
 #endif

-#ifndef MATSIZE
-#define MATSIZE 1000000
+#ifndef VECSIZE
+#define VECSIZE 1000000
 #endif

 #ifndef REPEAT
@@ -19,16 +19,16 @@ USING_PART_OF_NAMESPACE_EIGEN

 int main(int argc, char *argv[])
 {
-	MATTYPE I = MATTYPE::ones(MATSIZE,1);
-	MATTYPE m(MATSIZE,1);
-	for(int i = 0; i < MATSIZE; i++) for(int j = 0; j < 1; j++)
+	VECTYPE I = VECTYPE::ones(VECSIZE);
+	VECTYPE m(VECSIZE,1);
+	for(int i = 0; i < VECSIZE; i++)
 	{
-		m(i,j) = 0.1 * (i+j+1)/MATSIZE/MATSIZE;
+		m[i] = 0.1 * i/VECSIZE;
 	}
 	for(int a = 0; a < REPEAT; a++)
 	{
-		m = MATTYPE::ones(MATSIZE,1) + 0.00005 * (m.cwiseProduct(m) + m/4);
+		m = VECTYPE::ones(VECSIZE) + 0.00005 * (m.cwiseProduct(m) + m/4);
 	}
-	cout << m(0,0) << endl;
+	cout << m[0] << endl;
 	return 0;
 }