trmm is now working in all storage order configurations

2025-09-15 02:43:14 +08:00 · 2009-07-27 10:27:01 +02:00 · 2009-07-27 10:27:01 +02:00 · 6aba84719d
commit 6aba84719d
parent 1d4d9a37fd
4 changed files with 37 additions and 71 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@ -189,6 +189,7 @@ namespace Eigen {
 #include "src/Core/products/SelfadjointRank2Update.h"
 #include "src/Core/products/TriangularMatrixVector.h"
 #include "src/Core/products/TriangularSolverMatrix.h"
 #include "src/Core/products/TriangularMatrixMatrix.h"
 #include "src/Core/BandMatrix.h"
 } // namespace Eigen
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@ -125,7 +125,7 @@ struct ei_gebp_kernel
      // loops on each register blocking of lhs/res
      for(int i=0; i<peeled_mc; i+=mr)
      {
-        const Scalar* blA = &blockA[i*strideA];
+        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif
@ -248,7 +248,7 @@ struct ei_gebp_kernel
      }
      for(int i=peeled_mc; i<rows; i++)
      {
-        const Scalar* blA = &blockA[i*strideA];
+        const Scalar* blA = &blockA[i*strideA+offsetA];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif
@ -285,7 +285,7 @@ struct ei_gebp_kernel
    {
      for(int i=0; i<peeled_mc; i+=mr)
      {
-        const Scalar* blA = &blockA[i*strideA];
+        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif
@ -317,7 +317,7 @@ struct ei_gebp_kernel
      }
      for(int i=peeled_mc; i<rows; i++)
      {
-        const Scalar* blA = &blockA[i*strideA];
+        const Scalar* blA = &blockA[i*strideA+offsetA];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif
@ -375,17 +375,21 @@ struct ei_gemm_pack_lhs
 //  4  5  6  7   16 17 18 19   25 28
 //  8  9 10 11   20 21 22 23   26 29
 //  .  .  .  .    .  .  .  .    .  .
-template<typename Scalar, int nr>
+template<typename Scalar, int nr, bool PanelMode>
-struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
+struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
 {
  enum { PacketSize = ei_packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
+  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
@ -418,10 +422,13 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
      {
@ -439,34 +446,36 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
          count += PacketSize;
        }
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
 };
 // this version is optimized for row major matrices
-template<typename Scalar, int nr>
+template<typename Scalar, int nr, bool PanelMode>
-struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
+struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
 {
  enum { PacketSize = ei_packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
+  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
-          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
+                    ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
-          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
+                    ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
-          if (nr==4)
+          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
-          {
+          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
          }
          count += nr*PacketSize;
        }
      }
@ -475,26 +484,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
-          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
+                    ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
-          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
+                    ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
-          if (nr==4)
+          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
-          {
+          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
          }
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[j2];
      for(int k=0; k<depth; k++)
      {
        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
        count += PacketSize;
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
 };
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@ -25,9 +25,6 @@
 #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
 #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
 template<typename Scalar, int nr>
 struct ei_gemm_pack_rhs_panel;
 // if the rhs is row major, we have to evaluate it in a temporary colmajor matrix
 template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode>
 struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode>
@ -136,7 +133,7 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
          int blockBOffset = IsLowerTriangular ? k1 : lengthTarget;
          // update the respective rows of B from rhs
-          ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>()
+          ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true>()
            (blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
          // GEBP
@ -174,46 +171,4 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M
  }
 };
 template<typename Scalar, int nr>
 struct ei_gemm_pack_rhs_panel
 {
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset)
  {
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      count += PacketSize * nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      for(int k=0; k<depth; k++)
      {
                  ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
                  ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
        if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
        if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
        count += nr*PacketSize;
      }
      // skip what we have after
      count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      count += PacketSize * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(int k=0; k<depth; k++)
      {
        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
        count += PacketSize;
      }
      count += PacketSize * (stride-offset-depth);
    }
  }
 };
 #endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@ -32,7 +32,7 @@
 template<typename Scalar, int mr, int nr, typename Conj>
 struct ei_gebp_kernel;
-template<typename Scalar, int nr, int StorageOrder>
+template<typename Scalar, int nr, int StorageOrder, bool PanelMode=false>
 struct ei_gemm_pack_rhs;
 template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false>