* generalize rowmajor by vector

* fix weird compilation error when constructing a matrix with a row by matrix product
2025-09-12 09:23:12 +08:00 · 2010-07-10 22:53:27 +02:00 · 2010-07-10 22:53:27 +02:00 · e5bc9526f1
commit e5bc9526f1
parent c4ef69b5bd
6 changed files with 57 additions and 34 deletions
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -324,6 +324,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
    typedef typename ProductType::ActualRhsType ActualRhsType;
    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest;

    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
@ -342,7 +343,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
    else
    {
      actualDest = ei_aligned_stack_new(Scalar,dest.size());
-      Map<typename Dest::PlainObject>(actualDest, dest.size()) = dest;
+      MappedDest(actualDest, dest.size()) = dest;
    }

    ei_cache_friendly_product_colmajor_times_vector
@ -353,7 +354,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>

    if (!EvalToDest)
    {
-      dest = Map<typename Dest::PlainObject>(actualDest, dest.size());
+      dest = MappedDest(actualDest, dest.size());
      ei_aligned_stack_delete(Scalar, actualDest, dest.size());
    }
  }
@ -365,6 +366,7 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
  static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha)
  {
    typedef typename ProductType::Scalar Scalar;
+    typedef typename ProductType::Index Index;
    typedef typename ProductType::ActualLhsType ActualLhsType;
    typedef typename ProductType::ActualRhsType ActualRhsType;
    typedef typename ProductType::_ActualRhsType _ActualRhsType;
@ -394,9 +396,12 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
    }

    ei_cache_friendly_product_rowmajor_times_vector
-      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
+      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>(
+        actualLhs.rows(), actualLhs.cols(),
        &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
-        rhs_data, prod.rhs().size(), dest, actualAlpha);
+        rhs_data, 1,
+        &dest.coeffRef(0,0), dest.innerStride(),
+        actualAlpha);

    if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size());
  }
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@ -80,12 +80,13 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
        // 2 - it is slighlty faster at runtime
        Index startRow = IsLower ? pi : pi-actualPanelWidth;
        Index startCol = IsLower ? 0 : pi;
-        VectorBlock<Rhs,Dynamic> target(other,startRow,actualPanelWidth);

-        ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
+        ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>(
+          actualPanelWidth, r,
          &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(),
-          &(other.coeffRef(startCol)), r,
-          target, Scalar(-1));
+          &(other.coeffRef(startCol)), other.innerStride(),
+          &other.coeffRef(startRow), other.innerStride(),
+          Scalar(-1));
      }

      for(Index k=0; k<actualPanelWidth; ++k)
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@ -258,13 +258,15 @@ void ei_cache_friendly_product_colmajor_times_vector(
 }

 // TODO add peeling to mask unaligned load/stores
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
+  Index rows, Index cols,
  const Scalar* lhs, Index lhsStride,
-  const Scalar* rhs, Index rhsSize,
-  ResType& res,
+  const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr,
  Scalar alpha)
 {
+  ei_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
@ -291,22 +293,22 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
  const Index peels = 2;
  const Index PacketAlignedMask = PacketSize-1;
  const Index PeelAlignedMask = PacketSize*peels-1;
-  const Index size = rhsSize;
+  const Index depth = cols;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = ei_first_aligned(rhs, size);
-  Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  Index alignedStart = ei_first_aligned(rhs, depth);
+  Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0;
  const Index peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(PacketSize/2) ? EvenAligned
-                       : FirstAligned;
+                         : alignmentStep==(PacketSize/2) ? EvenAligned
+                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = ei_first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = ei_first_aligned(lhs,depth);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
@ -318,7 +320,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
  }
  else if (PacketSize>1)
  {
-    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || size<PacketSize);
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || depth<PacketSize);

    while (skipRows<PacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
@ -331,26 +333,26 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
    }
    else
    {
-      skipRows = std::min(skipRows,Index(res.size()));
+      skipRows = std::min(skipRows,Index(rows));
      // note that the skiped columns are processed later.
    }
    ei_internal_assert(  alignmentPattern==NoneAligned
                      || PacketSize==1
-                      || (skipRows + rowsAtOnce >= res.size())
-                      || PacketSize > rhsSize
+                      || (skipRows + rowsAtOnce >= rows)
+                      || PacketSize > depth
                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
-    alignedSize = size;
+    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

-  Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
@ -439,17 +441,20 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<size; ++j)
+    for (Index j=alignedSize; j<depth; ++j)
    {
      Scalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
-    res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1; res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3;
+    res[i*resIncr]            += alpha*tmp0;
+    res[(i+offset1)*resIncr]  += alpha*tmp1;
+    res[(i+2)*resIncr]        += alpha*tmp2;
+    res[(i+offset3)*resIncr]  += alpha*tmp3;
  }

  // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = res.size();
+  Index end = rows;
  Index start = rowBound;
  do
  {
@ -477,9 +482,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(

      // process remaining scalars
      // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<size; ++j)
+      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
-      res[i] += alpha*tmp0;
+      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@ -119,11 +119,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co
      if (r>0)
      {
        Index s = IsLower ? 0 : pi + actualPanelWidth;
-        Block<Result,Dynamic,1> target(res,pi,0,actualPanelWidth,1);
-        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs>(
+        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>(
+            actualPanelWidth, r,
            &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(),
-            &(rhs.const_cast_derived().coeffRef(s)), r,
-            target, alpha);
+            &(rhs.const_cast_derived().coeffRef(s)), 1,
+            &res.coeffRef(pi,0), res.innerStride(), alpha);
      }
    }
  }
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@ -49,9 +49,10 @@ template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index,
 static void ei_cache_friendly_product_colmajor_times_vector(
  Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);

-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static void ei_cache_friendly_product_rowmajor_times_vector(
-  const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsSize, ResType& res, Scalar alpha);
+  Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr, Scalar alpha);

 template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false>
 {
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@ -64,5 +64,16 @@ void test_product_large()
    // only makes sure it compiles fine
    computeProductBlockingSizes<float,float>(k1,m1,n1);
  }
+
+  {
+    // test regression in row-vector by matrix (bad Map type)
+    MatrixXf mat1(10,32); mat1.setRandom();
+    MatrixXf mat2(32,32); mat2.setRandom();
+    MatrixXf r1 = mat1.row(2)*mat2.transpose();
+    VERIFY_IS_APPROX(r1, (mat1.row(2)*mat2.transpose()).eval());
+
+    MatrixXf r2 = mat1.row(2)*mat2;
+    VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval());
+  }
 #endif
 }