Improve product kernel: replace the previous dynamic loop swaping strategy by a more general one:

It consists in increasing the actual number of rows of lhs's micro horizontal panel for small depth such that L1 cache is fully exploited.
2025-07-29 16:22:03 +08:00 · 2015-03-06 10:30:35 +01:00 · 2015-03-06 10:30:35 +01:00 · 58740ce4c6
commit 58740ce4c6
parent 5db2baa573
1 changed files with 37 additions and 50 deletions
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@ -230,6 +230,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
    {
      // So far, no blocking at all, i.e., kc==k, and nc==n.
      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
      Index problem_size = k*n*sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      Index max_mc = m;
@ -952,32 +953,27 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
    // Usually, make sense only with FMA
    if(mr>=3*Traits::LhsProgress)
    {      
 #ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION
      const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION;
 #else
      const bool swap_loops = depth<48;
 #endif
      Index bound1 =  swap_loops ? packet_cols4 : peeled_mc3;
      Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3;
      Index incr1  =  swap_loops ? nr : 3*Traits::LhsProgress;
      Index incr2  = !swap_loops ? nr : 3*Traits::LhsProgress;
      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
-      // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
-      // and on each largest micro vertical panel of rhs (depth * nr)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
-      for(Index it1=0; it1<bound1; it1+=incr1)
+      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
      // This actual number of rows is computed as follow:
      const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function.
      const Index actual_panel_rows = (3*LhsProgress) *  ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) );
      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
      {
-        for(Index it2=0; it2<bound2; it2+=incr2)
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
        for(Index j2=0; j2<packet_cols4; j2+=nr)
        {
-          Index i  =  swap_loops ? it2 : it1;
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
-          Index j2 = !swap_loops ? it2 : it1;
+          {
-          // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
          // stored into 3 x nr registers.
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);
          // gets res block as register
@ -1110,15 +1106,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);          
          }
        }
      }
-      // Deal with remaining columns of the rhs
+        // Deal with remaining columns of the rhs
      if(packet_cols4<cols)
      for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
      {
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
          {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
          prefetch(&blA[0]);
@ -1190,6 +1185,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);          
          }
        }
      }
    }
@ -1197,26 +1193,17 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
    //---------- Process 2 * LhsProgress rows at once ----------
    if(mr>=2*Traits::LhsProgress)
    {
-#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION
+      const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function.
-      const bool swap_loops = (mr<3*Traits::LhsProgress) && (EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION);
+      Index actual_panel_rows = (2*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) );
-#else
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
      const bool swap_loops = (mr<3*Traits::LhsProgress) && (depth<48);
 #endif 
      Index start1 =  swap_loops ? 0 : peeled_mc3;
      Index start2 = !swap_loops ? 0 : peeled_mc3;
      Index bound1 =  swap_loops ? packet_cols4 : peeled_mc2;
      Index bound2 = !swap_loops ? packet_cols4 : peeled_mc2;
      Index incr1  =  swap_loops ? nr : 2*Traits::LhsProgress;
      Index incr2  = !swap_loops ? nr : 2*Traits::LhsProgress;
      for(Index it1=start1; it1<bound1; it1+=incr1)
      {
-        for(Index it2=start2; it2<bound2; it2+=incr2)
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
        for(Index j2=0; j2<packet_cols4; j2+=nr)
        {
-          Index i  =  swap_loops ? it2 : it1;
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
-          Index j2 = !swap_loops ? it2 : it1;
+          {
-          // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
          // stored into 2 x nr registers.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
@ -1320,15 +1307,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
          }
        }
      }
-      // Deal with remaining columns of the rhs
+        // Deal with remaining columns of the rhs
      if(packet_cols4<cols)
      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*Traits::LhsProgress)
      {
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
          {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);
@ -1395,6 +1381,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          }
        }
      }
    }