mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-11 19:29:02 +08:00)
remove Qt's atomic dependency, I don't know what I was doing wrong...
This commit is contained in:
parent 31aa17e4ef
commit 1710c07f63
@@ -98,7 +98,6 @@ static void run(int rows, int cols, int depth,
  // if you have the GOTO blas library you can try our parallelization strategy
  // using GOTO's optimized routines.
  // #define USEGOTOROUTINES
  #ifdef USEGOTOROUTINES
  void* u = alloca(4096+sizeW);
  #endif
@@ -125,7 +124,8 @@ static void run(int rows, int cols, int depth,
  // However, before copying to B'_j, we have to make sure that no other thread is still using it,
  // i.e., we test that info[tid].users equals 0.
  // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
- while(!info[tid].users.testAndSetOrdered(0,threads)) {}
+ while(info[tid].users!=0) {}
+ info[tid].users += threads;

  #ifndef USEGOTOROUTINES
  pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
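For context on this change: QAtomicInt::testAndSetOrdered(0, threads) is a compare-and-swap that writes `threads` only if the counter currently reads 0. The replacement splits this into a plain spin-wait on a volatile int followed by an ordinary increment, apparently relying on the fact that, once users has dropped to 0, no other thread touches it until the owner republishes its block. A minimal sketch contrasting the two idioms (std::atomic is used purely to illustrate the CAS semantics; the names are placeholders, not Eigen API):

#include <atomic>

// CAS-style claim, roughly what QAtomicInt::testAndSetOrdered(0, threads) did:
// keep retrying until the counter reads 0, then atomically set it to `threads`.
void claim_cas(std::atomic<int>& users, int threads)
{
  int expected = 0;
  while(!users.compare_exchange_strong(expected, threads))
    expected = 0;      // CAS failed: reset the expected value and retry
}

// Volatile-flag claim, the idiom this commit switches to:
// spin until the counter drops to 0, then bump it by `threads`.
void claim_volatile(int volatile& users, int threads)
{
  while(users!=0) {}   // wait until no thread still reads the shared block
  users += threads;    // only the owning thread writes at this point
}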
@@ -134,7 +134,7 @@ static void run(int rows, int cols, int depth,
  #endif

  // Notify the other threads that the part B'_j is ready to go.
- info[tid].sync.fetchAndStoreOrdered(k);
+ info[tid].sync = k;

  // Computes C_i += A' * B' per B'_j
  for(int shift=0; shift<threads; ++shift)
@@ -145,7 +145,7 @@ static void run(int rows, int cols, int depth,
  // we use testAndSetOrdered to mimic a volatile access.
  // However, no need to wait for the B' part which has been updated by the current thread!
  if(shift>0)
-   while(!info[j].sync.testAndSetOrdered(k,k)) {}
+   while(info[j].sync!=k) {}

  #ifndef USEGOTOROUTINES
  gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w);
@@ -178,7 +178,8 @@ static void run(int rows, int cols, int depth,
  // Release all the sub blocks B'_j of B' for the current thread,
  // i.e., we simply decrement the number of users by 1
  for(int j=0; j<threads; ++j)
-   info[j].users.fetchAndAddOrdered(-1);
+   #pragma omp atomic
+   --(info[j].users);
 }

 ei_aligned_stack_delete(Scalar, blockA, kc*mc);
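Taken together, the hunks above replace Qt's atomic operations with volatile flags plus a single `#pragma omp atomic` decrement. Below is a self-contained sketch of that handshake with the GEMM details stripped out; the struct, buffer and loop names are invented for illustration, and the memory-visibility assumptions are the same informal ones the patch itself relies on (volatile reads/writes, no explicit fences):

#include <cstdio>
#include <vector>
#include <omp.h>

// Per-thread bookkeeping, mirroring the volatile members this commit
// introduces in GemmParallelInfo.
struct Info
{
  Info() : sync(-1), users(0) {}
  int volatile sync;   // index of the last block this thread has published
  int volatile users;  // how many threads may still be reading this thread's block
};

int main()
{
  const int threads = omp_get_max_threads();
  const int blocks  = 4;                      // plays the role of the depth (k) loop
  std::vector<Info> info(threads);
  std::vector<double> shared(threads, 0.0);   // stand-in for the shared B' buffer

  #pragma omp parallel num_threads(threads)
  {
    const int tid = omp_get_thread_num();
    for(int k=0; k<blocks; ++k)
    {
      // Claim our slice: wait until nobody uses it any more, then mark it
      // as about to be used by all threads (same idiom as the patch).
      while(info[tid].users!=0) {}
      info[tid].users += threads;

      shared[tid] = 100.0*k + tid;            // "pack" our slice for block k

      // Publish: notify the other threads that block k of our slice is ready.
      info[tid].sync = k;

      // Use every slice, starting with our own (no need to wait for it).
      double acc = 0.0;
      for(int shift=0; shift<threads; ++shift)
      {
        const int j = (tid+shift)%threads;
        if(shift>0)
          while(info[j].sync!=k) {}           // wait until thread j published block k
        acc += shared[j];                     // "compute" with thread j's slice
      }

      // Release every slice we used: one atomic decrement per slice.
      for(int j=0; j<threads; ++j)
      {
        #pragma omp atomic
        --(info[j].users);
      }

      #pragma omp critical
      std::printf("thread %d finished block %d (acc=%g)\n", tid, k, acc);
    }
  }
  return 0;
}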
@@ -92,8 +92,11 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2)

 struct GemmParallelInfo
 {
-  QAtomicInt sync;
-  QAtomicInt users;
+  GemmParallelInfo() : sync(-1), users(0) {}
+
+  int volatile sync;
+  int volatile users;

   int rhs_start;
   int rhs_length;
   float* blockB;
@@ -118,7 +121,7 @@ void ei_run_parallel_gemm(const Functor& func, int rows, int cols)

  GemmParallelInfo* info = new GemmParallelInfo[threads];

- #pragma omp parallel for schedule(static,1)
+ #pragma omp parallel for schedule(static,1) shared(info)
  for(int i=0; i<threads; ++i)
  {
    int r0 = i*blockRows;
@@ -130,8 +133,6 @@ void ei_run_parallel_gemm(const Functor& func, int rows, int cols)
    info[i].rhs_start = c0;
    info[i].rhs_length = actualBlockCols;
    info[i].blockB = sharedBlockB;
-   info[i].sync.fetchAndStoreOrdered(-1);
-   info[i].users.fetchAndStoreOrdered(0);

    func(r0, actualBlockRows, 0,cols, info);
  }
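On the dispatch side: with schedule(static,1) and exactly as many iterations as threads in the team, iteration i is assigned to thread i, so each loop body is effectively "the work of thread i"; shared(info) only makes explicit that every thread operates on the same heap-allocated array (pointers declared outside the construct are shared by default anyway). A small sketch of that pattern, with placeholder names rather than Eigen's:

#include <cstdio>
#include <omp.h>

struct Info
{
  Info() : sync(-1), users(0) {}
  int volatile sync;
  int volatile users;
};

int main()
{
  const int threads = omp_get_max_threads();
  Info* info = new Info[threads];   // one bookkeeping slot per thread, as in the patch

  // schedule(static,1): iterations are handed out round-robin in chunks of 1,
  // so iteration i lands on thread i when there are as many iterations as threads.
  // shared(info): all iterations see the same heap-allocated array.
  #pragma omp parallel for schedule(static,1) shared(info)
  for(int i=0; i<threads; ++i)
  {
    // the real code computes thread i's row/column ranges here and then
    // calls the user functor with a pointer to the shared info array
    std::printf("iteration %d runs on thread %d\n", i, omp_get_thread_num());
  }

  delete[] info;
  return 0;
}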
@@ -2,8 +2,6 @@
 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out

-#include <QAtomicInt>
-
 #include <Eigen/Core>

 #include <bench/BenchTimer.h>
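The same kind of compile line also works for the standalone sketches above (assuming they are saved as sketch.cpp; unlike the benchmark, they only need an OpenMP-enabled compiler, not -lrt):

// g++ sketch.cpp -O2 -fopenmp && OMP_NUM_THREADS=2 ./a.out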