* generalize the row-major matrix-by-vector product kernel: it now takes explicit dimensions and strides instead of a templated result type, so it can read and write strided vectors

* fix a weird compilation error when constructing a matrix from a row-vector by matrix product
Gael Guennebaud 2010-07-10 22:53:27 +02:00
parent c4ef69b5bd
commit e5bc9526f1
6 changed files with 57 additions and 34 deletions
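
In short: the row-major kernel previously received its result as a templated `ResType&` and its right-hand side as a pointer plus size; after this commit it takes explicit `rows`/`cols` plus increments (`rhsIncr`, `resIncr`), so callers can pass strided inputs and outputs directly. The compilation fix is covered by the regression test added at the bottom; a minimal standalone reproducer (my reconstruction from that test, not part of the commit) is:

    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      MatrixXf mat1(10,32); mat1.setRandom();
      MatrixXf mat2(32,32); mat2.setRandom();
      // Before this commit, constructing a matrix from a row-vector by
      // matrix product failed to compile: the gemv path wrapped its
      // temporary in Map<typename Dest::PlainObject>, and for Dest =
      // MatrixXf a matrix Map has no (pointer, size) constructor.
      MatrixXf r2 = mat1.row(2)*mat2;
      return (r2.rows()==1 && r2.cols()==32) ? 0 : 1;
    }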


@@ -324,6 +324,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     typedef typename ProductType::ActualRhsType ActualRhsType;
     typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
     typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest;

     ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
     ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
@@ -342,7 +343,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     else
     {
       actualDest = ei_aligned_stack_new(Scalar,dest.size());
-      Map<typename Dest::PlainObject>(actualDest, dest.size()) = dest;
+      MappedDest(actualDest, dest.size()) = dest;
     }

     ei_cache_friendly_product_colmajor_times_vector
@@ -353,7 +354,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>

     if (!EvalToDest)
     {
-      dest = Map<typename Dest::PlainObject>(actualDest, dest.size());
+      dest = MappedDest(actualDest, dest.size());
       ei_aligned_stack_delete(Scalar, actualDest, dest.size());
     }
   }
@@ -365,6 +366,7 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
   static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha)
   {
     typedef typename ProductType::Scalar Scalar;
+    typedef typename ProductType::Index Index;
     typedef typename ProductType::ActualLhsType ActualLhsType;
     typedef typename ProductType::ActualRhsType ActualRhsType;
     typedef typename ProductType::_ActualRhsType _ActualRhsType;
@@ -394,9 +396,12 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
     }

     ei_cache_friendly_product_rowmajor_times_vector
-      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
+      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>(
+        actualLhs.rows(), actualLhs.cols(),
         &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
-        rhs_data, prod.rhs().size(), dest, actualAlpha);
+        rhs_data, 1,
+        &dest.coeffRef(0,0), dest.innerStride(),
+        actualAlpha);

     if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size());
   }
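
The `MappedDest` typedef above is the actual fix for the second bullet of the commit message: it always maps the stack temporary as a contiguous column vector instead of as `Dest`'s plain type. A minimal sketch of the difference between the two Map types (illustration only, assuming float scalars; not code from the commit):

    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      EIGEN_ALIGN16 float buffer[4] = {1, 2, 3, 4};
      // The vector Map used by MappedDest has a (pointer, size) constructor:
      Map<Matrix<float,Dynamic,1>, Aligned> v(buffer, 4);
      // A matrix Map only has a (pointer, rows, cols) constructor, so
      // Map<MatrixXf>(buffer, 4) is ill-formed -- presumably the "weird
      // compilation error" hit when Dest::PlainObject is a matrix type:
      Map<MatrixXf> m(buffer, 2, 2);
      v.setZero();
      return int(m(0,0));  // m views the buffer that v just zeroed
    }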


@@ -80,12 +80,13 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
       // 2 - it is slighlty faster at runtime
       Index startRow = IsLower ? pi : pi-actualPanelWidth;
       Index startCol = IsLower ? 0 : pi;
-      VectorBlock<Rhs,Dynamic> target(other,startRow,actualPanelWidth);
-      ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
+      ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>(
+        actualPanelWidth, r,
         &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(),
-        &(other.coeffRef(startCol)), r,
-        target, Scalar(-1));
+        &(other.coeffRef(startCol)), other.innerStride(),
+        &other.coeffRef(startRow), other.innerStride(),
+        Scalar(-1));
     }
     for(Index k=0; k<actualPanelWidth; ++k)


@@ -258,13 +258,15 @@ void ei_cache_friendly_product_colmajor_times_vector(
 }

 // TODO add peeling to mask unaligned load/stores
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
+  Index rows, Index cols,
   const Scalar* lhs, Index lhsStride,
-  const Scalar* rhs, Index rhsSize,
-  ResType& res,
+  const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr,
   Scalar alpha)
 {
+  ei_internal_assert(rhsIncr==1);
 #ifdef _EIGEN_ACCUMULATE_PACKETS
 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
 #endif
@@ -291,13 +293,13 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   const Index peels = 2;
   const Index PacketAlignedMask = PacketSize-1;
   const Index PeelAlignedMask = PacketSize*peels-1;
-  const Index size = rhsSize;
+  const Index depth = cols;

   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type
   // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = ei_first_aligned(rhs, size);
-  Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  Index alignedStart = ei_first_aligned(rhs, depth);
+  Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0;
   const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

   const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
@@ -306,7 +308,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     : FirstAligned;

   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = ei_first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = ei_first_aligned(lhs,depth);

   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
@@ -318,7 +320,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   }
   else if (PacketSize>1)
   {
-    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || depth<PacketSize);

    while (skipRows<PacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
@@ -331,26 +333,26 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     }
     else
     {
-      skipRows = std::min(skipRows,Index(res.size()));
+      skipRows = std::min(skipRows,Index(rows));
       // note that the skiped columns are processed later.
     }

     ei_internal_assert(  alignmentPattern==NoneAligned
                       || PacketSize==1
-                      || (skipRows + rowsAtOnce >= res.size())
-                      || PacketSize > rhsSize
+                      || (skipRows + rowsAtOnce >= rows)
+                      || PacketSize > depth
                       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
   else if(Vectorizable)
   {
     alignedStart = 0;
-    alignedSize = size;
+    alignedSize = depth;
     alignmentPattern = AllAligned;
   }

   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);

-  Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
     EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
@@ -439,17 +441,20 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     // process remaining coeffs (or all if no explicit vectorization)
     // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<size; ++j)
+    for (Index j=alignedSize; j<depth; ++j)
     {
       Scalar b = rhs[j];
       tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
       tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
     }
-    res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1; res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3;
+    res[i*resIncr] += alpha*tmp0;
+    res[(i+offset1)*resIncr] += alpha*tmp1;
+    res[(i+2)*resIncr] += alpha*tmp2;
+    res[(i+offset3)*resIncr] += alpha*tmp3;
   }

   // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = res.size();
+  Index end = rows;
   Index start = rowBound;
   do
   {
@@ -477,9 +482,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     // process remaining scalars
     // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<size; ++j)
+    for (Index j=alignedSize; j<depth; ++j)
       tmp0 += cj.pmul(lhs0[j], rhs[j]);
-    res[i] += alpha*tmp0;
+    res[i*resIncr] += alpha*tmp0;
   }

   if (skipRows)
   {
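
Taken together, these hunks change the kernel's contract from "accumulate into an Eigen expression `res`" to "accumulate into a raw pointer with an explicit stride". A naive, unvectorized reference of the new contract (my paraphrase for clarity; it ignores conjugation, alignment, and peeling, and is not Eigen's optimized code):

    #include <cassert>

    // lhs: rows-by-cols row-major matrix, lhsStride between consecutive rows;
    // rhs: read contiguously (the new assert above requires rhsIncr==1);
    // res: accumulated with stride resIncr, which is what lets the callers
    //      write straight into a matrix row or a triangular-solver panel.
    template<typename Scalar, typename Index>
    void rowmajor_times_vector_reference(Index rows, Index cols,
                                         const Scalar* lhs, Index lhsStride,
                                         const Scalar* rhs, Index rhsIncr,
                                         Scalar* res, Index resIncr, Scalar alpha)
    {
      assert(rhsIncr == 1);
      for (Index i = 0; i < rows; ++i)
      {
        Scalar tmp = Scalar(0);
        for (Index j = 0; j < cols; ++j)
          tmp += lhs[i*lhsStride + j] * rhs[j];
        res[i*resIncr] += alpha*tmp;  // same update as res[i*resIncr] += alpha*tmp0 above
      }
    }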


@@ -119,11 +119,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co
       if (r>0)
       {
         Index s = IsLower ? 0 : pi + actualPanelWidth;
-        Block<Result,Dynamic,1> target(res,pi,0,actualPanelWidth,1);
-        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs>(
+        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>(
+          actualPanelWidth, r,
           &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(),
-          &(rhs.const_cast_derived().coeffRef(s)), r,
-          target, alpha);
+          &(rhs.const_cast_derived().coeffRef(s)), 1,
+          &res.coeffRef(pi,0), res.innerStride(), alpha);
       }
     }
   }


@@ -49,9 +49,10 @@ template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index,
 static void ei_cache_friendly_product_colmajor_times_vector(
   Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);

-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static void ei_cache_friendly_product_rowmajor_times_vector(
-  const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsSize, ResType& res, Scalar alpha);
+  Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr, Scalar alpha);

 template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false>
 {


@@ -64,5 +64,16 @@ void test_product_large()
     // only makes sure it compiles fine
     computeProductBlockingSizes<float,float>(k1,m1,n1);
   }
+
+  {
+    // test regression in row-vector by matrix (bad Map type)
+    MatrixXf mat1(10,32); mat1.setRandom();
+    MatrixXf mat2(32,32); mat2.setRandom();
+    MatrixXf r1 = mat1.row(2)*mat2.transpose();
+    VERIFY_IS_APPROX(r1, (mat1.row(2)*mat2.transpose()).eval());
+    MatrixXf r2 = mat1.row(2)*mat2;
+    VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval());
+  }
+
 #endif
 }