mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-11 11:19:02 +08:00
* GEMM enhencement: no need to pre-transpose the rhs
=> faster a * b.transpose() product => this also fix a bug in a so far untested situation * SYMM is now ready for use => still have to write the high level stuff to convert natural expressions into a call to SYMM
This commit is contained in:
parent
d6475ea390
commit
e7f8e939e2
@ -73,79 +73,6 @@ struct ProductReturnType<Lhs,Rhs,CacheFriendlyProduct>
|
||||
typedef Product<LhsNested, RhsNested, CacheFriendlyProduct> Type;
|
||||
};
|
||||
|
||||
/* Helper class to analyze the factors of a Product expression.
|
||||
* In particular it allows to pop out operator-, scalar multiples,
|
||||
* and conjugate */
|
||||
template<typename XprType> struct ei_blas_traits
|
||||
{
|
||||
typedef typename ei_traits<XprType>::Scalar Scalar;
|
||||
typedef XprType ActualXprType;
|
||||
enum {
|
||||
IsComplex = NumTraits<Scalar>::IsComplex,
|
||||
NeedToConjugate = false,
|
||||
ActualAccess = int(ei_traits<XprType>::Flags)&DirectAccessBit ? HasDirectAccess : NoDirectAccess
|
||||
};
|
||||
typedef typename ei_meta_if<int(ActualAccess)==HasDirectAccess,
|
||||
const ActualXprType&,
|
||||
typename ActualXprType::PlainMatrixType
|
||||
>::ret DirectLinearAccessType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return x; }
|
||||
static inline Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
|
||||
};
|
||||
|
||||
// pop conjugate
|
||||
template<typename Scalar, typename NestedXpr> struct ei_blas_traits<CwiseUnaryOp<ei_scalar_conjugate_op<Scalar>, NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef CwiseUnaryOp<ei_scalar_conjugate_op<Scalar>, NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
|
||||
enum {
|
||||
IsComplex = NumTraits<Scalar>::IsComplex,
|
||||
NeedToConjugate = IsComplex
|
||||
};
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(x._expression()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x) { return ei_conj(Base::extractScalarFactor(x._expression())); }
|
||||
};
|
||||
|
||||
// pop scalar multiple
|
||||
template<typename Scalar, typename NestedXpr> struct ei_blas_traits<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(x._expression()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
{ return x._functor().m_other * Base::extractScalarFactor(x._expression()); }
|
||||
};
|
||||
|
||||
// pop opposite
|
||||
template<typename Scalar, typename NestedXpr> struct ei_blas_traits<CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(x._expression()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
{ return - Base::extractScalarFactor(x._expression()); }
|
||||
};
|
||||
|
||||
// pop opposite
|
||||
template<typename NestedXpr> struct ei_blas_traits<NestByValue<NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef typename NestedXpr::Scalar Scalar;
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef NestByValue<NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(static_cast<const NestedXpr&>(x)); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
{ return Base::extractScalarFactor(static_cast<const NestedXpr&>(x)); }
|
||||
};
|
||||
|
||||
/* Helper class to determine the type of the product, can be either:
|
||||
* - NormalProduct
|
||||
* - CacheFriendlyProduct
|
||||
@ -869,25 +796,6 @@ inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFrien
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename T> struct ei_product_copy_rhs
|
||||
{
|
||||
typedef typename ei_meta_if<
|
||||
(ei_traits<T>::Flags & RowMajorBit)
|
||||
|| (!(ei_traits<T>::Flags & DirectAccessBit)),
|
||||
typename ei_plain_matrix_type_column_major<T>::type,
|
||||
const T&
|
||||
>::ret type;
|
||||
};
|
||||
|
||||
template<typename T> struct ei_product_copy_lhs
|
||||
{
|
||||
typedef typename ei_meta_if<
|
||||
(!(int(ei_traits<T>::Flags) & DirectAccessBit)),
|
||||
typename ei_plain_matrix_type<T>::type,
|
||||
const T&
|
||||
>::ret type;
|
||||
};
|
||||
|
||||
template<typename Lhs, typename Rhs, int ProductMode>
|
||||
template<typename DestDerived>
|
||||
inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived& res, Scalar alpha) const
|
||||
@ -895,26 +803,22 @@ inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived&
|
||||
typedef ei_blas_traits<_LhsNested> LhsProductTraits;
|
||||
typedef ei_blas_traits<_RhsNested> RhsProductTraits;
|
||||
|
||||
typedef typename LhsProductTraits::ActualXprType ActualLhsType;
|
||||
typedef typename RhsProductTraits::ActualXprType ActualRhsType;
|
||||
typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
|
||||
typedef typename RhsProductTraits::DirectLinearAccessType ActualRhsType;
|
||||
|
||||
const ActualLhsType& actualLhs = LhsProductTraits::extract(m_lhs);
|
||||
const ActualRhsType& actualRhs = RhsProductTraits::extract(m_rhs);
|
||||
typedef typename ei_cleantype<ActualLhsType>::type _ActualLhsType;
|
||||
typedef typename ei_cleantype<ActualRhsType>::type _ActualRhsType;
|
||||
|
||||
const ActualLhsType lhs = LhsProductTraits::extract(m_lhs);
|
||||
const ActualRhsType rhs = RhsProductTraits::extract(m_rhs);
|
||||
|
||||
Scalar actualAlpha = alpha * LhsProductTraits::extractScalarFactor(m_lhs)
|
||||
* RhsProductTraits::extractScalarFactor(m_rhs);
|
||||
|
||||
typedef typename ei_product_copy_lhs<ActualLhsType>::type LhsCopy;
|
||||
typedef typename ei_unref<LhsCopy>::type _LhsCopy;
|
||||
typedef typename ei_product_copy_rhs<ActualRhsType>::type RhsCopy;
|
||||
typedef typename ei_unref<RhsCopy>::type _RhsCopy;
|
||||
LhsCopy lhs(actualLhs);
|
||||
RhsCopy rhs(actualRhs);
|
||||
|
||||
ei_general_matrix_matrix_product<
|
||||
Scalar,
|
||||
(_LhsCopy::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsProductTraits::NeedToConjugate),
|
||||
(_RhsCopy::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsProductTraits::NeedToConjugate),
|
||||
(_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsProductTraits::NeedToConjugate),
|
||||
(_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsProductTraits::NeedToConjugate),
|
||||
(DestDerived::Flags&RowMajorBit)?RowMajor:ColMajor>
|
||||
::run(
|
||||
rows(), cols(), lhs.cols(),
|
||||
|
@ -89,16 +89,16 @@ static void run(int rows, int cols, int depth,
|
||||
// we have selected one row panel of rhs and one column panel of lhs
|
||||
// pack rhs's panel into a sequential chunk of memory
|
||||
// and expand each coeff to a constant packet for further reuse
|
||||
ei_gemm_pack_rhs<Scalar, Blocking::PacketSize, Blocking::nr>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
|
||||
ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
|
||||
|
||||
// => GEPP_VAR1
|
||||
for(int i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,rows)-i2;
|
||||
|
||||
|
||||
ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
ei_gebp_kernel<Scalar, PacketType, Blocking::PacketSize, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
|
||||
(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
|
||||
}
|
||||
}
|
||||
@ -110,11 +110,13 @@ static void run(int rows, int cols, int depth,
|
||||
};
|
||||
|
||||
// optimized GEneral packed Block * packed Panel product kernel
|
||||
template<typename Scalar, typename PacketType, int PacketSize, int mr, int nr, typename Conj>
|
||||
template<typename Scalar, int mr, int nr, typename Conj>
|
||||
struct ei_gebp_kernel
|
||||
{
|
||||
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols)
|
||||
{
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
Conj cj;
|
||||
const int peeled_mc = (actual_mc/mr)*mr;
|
||||
// loops on each cache friendly block of the result/rhs
|
||||
@ -276,7 +278,7 @@ struct ei_gebp_kernel
|
||||
if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// process remaining rhs/res columns one at a time
|
||||
// => do the same but with nr==1
|
||||
for(int j2=packet_cols; j2<cols; j2++)
|
||||
@ -353,9 +355,11 @@ struct ei_gemm_pack_lhs
|
||||
};
|
||||
|
||||
// copy a complete panel of the rhs while expending each coefficient into a packet form
|
||||
template<typename Scalar, int PacketSize, int nr>
|
||||
struct ei_gemm_pack_rhs
|
||||
// this version is optimized for column major matrices
|
||||
template<typename Scalar, int nr>
|
||||
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
|
||||
{
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
|
||||
{
|
||||
bool hasAlpha = alpha != Scalar(1);
|
||||
@ -419,6 +423,61 @@ struct ei_gemm_pack_rhs
|
||||
}
|
||||
};
|
||||
|
||||
// this version is optimized for row major matrices
|
||||
template<typename Scalar, int nr>
|
||||
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
|
||||
{
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
|
||||
{
|
||||
bool hasAlpha = alpha != Scalar(1);
|
||||
int count = 0;
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
if (hasAlpha)
|
||||
{
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
{
|
||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
{
|
||||
const Scalar* b0 = &rhs[k*rhsStride + j2];
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
// copy the remaining columns one at a time (nr==1)
|
||||
for(int j2=packet_cols; j2<cols; ++j2)
|
||||
{
|
||||
const Scalar* b0 = &rhs[j2];
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
|
||||
count += PacketSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif // EIGEN_EXTERN_INSTANTIATIONS
|
||||
|
||||
#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
|
||||
|
@ -31,11 +31,12 @@ struct ei_symm_pack_lhs
|
||||
{
|
||||
void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
|
||||
{
|
||||
ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar,StorageOrder> lhs(_lhs,lhsStride);
|
||||
int count = 0;
|
||||
const int peeled_mc = (actual_mc/mr)*mr;
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
// normal copy
|
||||
for(int k=0; k<i; k++)
|
||||
for(int w=0; w<mr; w++)
|
||||
blockA[count++] = lhs(i+w,k);
|
||||
@ -55,6 +56,7 @@ struct ei_symm_pack_lhs
|
||||
for(int w=0; w<mr; w++)
|
||||
blockA[count++] = lhs(k, i+w);
|
||||
}
|
||||
|
||||
// do the same with mr==1
|
||||
for(int i=peeled_mc; i<actual_mc; i++)
|
||||
{
|
||||
@ -67,86 +69,278 @@ struct ei_symm_pack_lhs
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar, int nr, int StorageOrder>
|
||||
struct ei_symm_pack_rhs
|
||||
{
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols, int k2)
|
||||
{
|
||||
int end_k = k2 + actual_kc;
|
||||
int count = 0;
|
||||
ei_const_blas_data_mapper<Scalar,StorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
// first part: standard case
|
||||
for(int j2=0; j2<k2; j2+=nr)
|
||||
{
|
||||
for(int k=k2; k<end_k; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
}
|
||||
|
||||
// second part: diagonal block
|
||||
for(int j2=k2; j2<std::min(k2+actual_kc,packet_cols); j2+=nr)
|
||||
{
|
||||
// again we can split vertically in three different parts (transpose, symmetric, normal)
|
||||
// transpose
|
||||
for(int k=k2; k<j2; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(j2+0,k)));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(j2+1,k)));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(j2+2,k)));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(j2+3,k)));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
// symmetric
|
||||
int h = 0;
|
||||
for(int k=j2; k<j2+nr; k++)
|
||||
{
|
||||
// normal
|
||||
for (int w=0 ; w<h; ++w)
|
||||
ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*rhs(k,j2+w)));
|
||||
// transpose
|
||||
for (int w=h ; w<nr; ++w)
|
||||
ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*rhs(j2+w,k)));
|
||||
count += nr*PacketSize;
|
||||
++h;
|
||||
}
|
||||
// normal
|
||||
for(int k=j2+nr; k<end_k; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
}
|
||||
|
||||
// third part: transpose
|
||||
for(int j2=k2+actual_kc; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
for(int k=k2; k<end_k; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(j2+0,k)));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(j2+1,k)));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(j2+2,k)));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(j2+3,k)));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
}
|
||||
|
||||
// copy the remaining columns one at a time (=> the same with nr==1)
|
||||
for(int j2=packet_cols; j2<cols; ++j2)
|
||||
{
|
||||
// transpose
|
||||
int half = std::min(end_k,j2);
|
||||
for(int k=k2; k<half; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*rhs(j2,k)));
|
||||
count += PacketSize;
|
||||
}
|
||||
// normal
|
||||
for(int k=half; k<k2+actual_kc; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2)));
|
||||
count += PacketSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
|
||||
* the general matrix matrix product.
|
||||
*/
|
||||
template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
|
||||
static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix(
|
||||
int size,
|
||||
const Scalar* _lhs, int lhsStride,
|
||||
const Scalar* _rhs, int rhsStride, bool rhsRowMajor, int cols,
|
||||
Scalar* res, int resStride,
|
||||
Scalar alpha)
|
||||
template <typename Scalar,
|
||||
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
|
||||
int ResStorageOrder>
|
||||
struct ei_product_selfadjoint_matrix;
|
||||
|
||||
template <typename Scalar,
|
||||
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
|
||||
struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
|
||||
{
|
||||
typedef typename ei_packet_traits<Scalar>::type Packet;
|
||||
|
||||
ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, ColMajor> rhs(_rhs,rhsStride);
|
||||
|
||||
if (ConjugateRhs)
|
||||
alpha = ei_conj(alpha);
|
||||
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
|
||||
const bool lhsRowMajor = (StorageOrder==RowMajor);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||
|
||||
int kc = std::min<int>(Blocking::Max_kc,size); // cache block size along the K direction
|
||||
int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
|
||||
// number of columns which can be processed by packet of nr columns
|
||||
int packet_cols = (cols/Blocking::nr)*Blocking::nr;
|
||||
|
||||
ei_gebp_kernel<Scalar, PacketType, Blocking::PacketSize,
|
||||
Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
|
||||
|
||||
for(int k2=0; k2<size; k2+=kc)
|
||||
static EIGEN_STRONG_INLINE void run(
|
||||
int rows, int cols,
|
||||
const Scalar* lhs, int lhsStride,
|
||||
const Scalar* rhs, int rhsStride,
|
||||
Scalar* res, int resStride,
|
||||
Scalar alpha)
|
||||
{
|
||||
const int actual_kc = std::min(k2+kc,size)-k2;
|
||||
|
||||
// we have selected one row panel of rhs and one column panel of lhs
|
||||
// pack rhs's panel into a sequential chunk of memory
|
||||
// and expand each coeff to a constant packet for further reuse
|
||||
ei_gemm_pack_rhs<Scalar,Blocking::PacketSize,Blocking::nr>()
|
||||
(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
|
||||
|
||||
// the select lhs's panel has to be split in three different parts:
|
||||
// 1 - the transposed panel above the diagonal block => transposed packed copy
|
||||
// 2 - the diagonal block => special packed copy
|
||||
// 3 - the panel below the diagonal block => generic packed copy
|
||||
for(int i2=0; i2<k2; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,k2)-i2;
|
||||
// transposed packed copy
|
||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,StorageOrder==RowMajor?ColMajor:RowMajor>()
|
||||
(blockA, &lhs(k2,i2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
|
||||
}
|
||||
// the block diagonal
|
||||
{
|
||||
const int actual_mc = std::min(k2+kc,size)-k2;
|
||||
// symmetric packed copy
|
||||
ei_symm_pack_lhs<Scalar,Blocking::mr,StorageOrder>()
|
||||
(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
|
||||
}
|
||||
|
||||
for(int i2=k2+kc; i2<size; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,size)-i2;
|
||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,StorageOrder>()
|
||||
(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
|
||||
}
|
||||
ei_product_selfadjoint_matrix<Scalar,
|
||||
RhsStorageOrder==RowMajor ? ColMajor : RowMajor, RhsSelfAdjoint, ConjugateRhs,
|
||||
LhsStorageOrder==RowMajor ? ColMajor : RowMajor, LhsSelfAdjoint, ConjugateLhs, ColMajor>
|
||||
::run(rows, cols, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
|
||||
}
|
||||
};
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
}
|
||||
template <typename Scalar,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
|
||||
{
|
||||
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
int rows, int cols,
|
||||
const Scalar* _lhs, int lhsStride,
|
||||
const Scalar* _rhs, int rhsStride,
|
||||
Scalar* res, int resStride,
|
||||
Scalar alpha)
|
||||
{
|
||||
int size = rows;
|
||||
|
||||
ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
if (ConjugateRhs)
|
||||
alpha = ei_conj(alpha);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||
|
||||
int kc = std::min<int>(Blocking::Max_kc,size); // cache block size along the K direction
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
|
||||
// number of columns which can be processed by packet of nr columns
|
||||
int packet_cols = (cols/Blocking::nr)*Blocking::nr;
|
||||
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
|
||||
|
||||
for(int k2=0; k2<size; k2+=kc)
|
||||
{
|
||||
const int actual_kc = std::min(k2+kc,size)-k2;
|
||||
|
||||
// we have selected one row panel of rhs and one column panel of lhs
|
||||
// pack rhs's panel into a sequential chunk of memory
|
||||
// and expand each coeff to a constant packet for further reuse
|
||||
ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
|
||||
(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
|
||||
|
||||
// the select lhs's panel has to be split in three different parts:
|
||||
// 1 - the transposed panel above the diagonal block => transposed packed copy
|
||||
// 2 - the diagonal block => special packed copy
|
||||
// 3 - the panel below the diagonal block => generic packed copy
|
||||
for(int i2=0; i2<k2; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,k2)-i2;
|
||||
// transposed packed copy if Lower part
|
||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor>()
|
||||
(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
|
||||
}
|
||||
// the block diagonal
|
||||
{
|
||||
const int actual_mc = std::min(k2+kc,size)-k2;
|
||||
// symmetric packed copy
|
||||
ei_symm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
|
||||
(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
|
||||
}
|
||||
|
||||
for(int i2=k2+kc; i2<size; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,size)-i2;
|
||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
|
||||
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
|
||||
}
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
}
|
||||
};
|
||||
|
||||
// matrix * selfadjoint product
|
||||
template <typename Scalar,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
|
||||
{
|
||||
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
int rows, int cols,
|
||||
const Scalar* _lhs, int lhsStride,
|
||||
const Scalar* _rhs, int rhsStride,
|
||||
Scalar* res, int resStride,
|
||||
Scalar alpha)
|
||||
{
|
||||
int size = cols;
|
||||
|
||||
ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
if (ConjugateRhs)
|
||||
alpha = ei_conj(alpha);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||
|
||||
int kc = std::min<int>(Blocking::Max_kc,size); // cache block size along the K direction
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
|
||||
|
||||
// number of columns which can be processed by packet of nr columns
|
||||
int packet_cols = (cols/Blocking::nr)*Blocking::nr;
|
||||
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
|
||||
|
||||
for(int k2=0; k2<size; k2+=kc)
|
||||
{
|
||||
const int actual_kc = std::min(k2+kc,size)-k2;
|
||||
|
||||
ei_symm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
|
||||
(blockB, _rhs, rhsStride, alpha, actual_kc, packet_cols, cols, k2);
|
||||
|
||||
// => GEPP
|
||||
for(int i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,rows)-i2;
|
||||
ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
|
||||
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
|
||||
}
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
|
||||
|
@ -29,10 +29,10 @@
|
||||
// implement and control fast level 2 and level 3 BLAS-like routines.
|
||||
|
||||
// forward declarations
|
||||
template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj>
|
||||
template<typename Scalar, int mr, int nr, typename Conj>
|
||||
struct ei_gebp_kernel;
|
||||
|
||||
template<typename Scalar, int PacketSize, int nr>
|
||||
template<typename Scalar, int nr, int StorageOrder>
|
||||
struct ei_gemm_pack_rhs;
|
||||
|
||||
template<typename Scalar, int mr, int StorageOrder>
|
||||
@ -154,4 +154,77 @@ struct ei_product_blocking_traits
|
||||
};
|
||||
};
|
||||
|
||||
/* Helper class to analyze the factors of a Product expression.
|
||||
* In particular it allows to pop out operator-, scalar multiples,
|
||||
* and conjugate */
|
||||
template<typename XprType> struct ei_blas_traits
|
||||
{
|
||||
typedef typename ei_traits<XprType>::Scalar Scalar;
|
||||
typedef XprType ActualXprType;
|
||||
enum {
|
||||
IsComplex = NumTraits<Scalar>::IsComplex,
|
||||
NeedToConjugate = false,
|
||||
ActualAccess = int(ei_traits<XprType>::Flags)&DirectAccessBit ? HasDirectAccess : NoDirectAccess
|
||||
};
|
||||
typedef typename ei_meta_if<int(ActualAccess)==HasDirectAccess,
|
||||
const ActualXprType&,
|
||||
typename ActualXprType::PlainMatrixType
|
||||
>::ret DirectLinearAccessType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return x; }
|
||||
static inline Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
|
||||
};
|
||||
|
||||
// pop conjugate
|
||||
template<typename Scalar, typename NestedXpr> struct ei_blas_traits<CwiseUnaryOp<ei_scalar_conjugate_op<Scalar>, NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef CwiseUnaryOp<ei_scalar_conjugate_op<Scalar>, NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
|
||||
enum {
|
||||
IsComplex = NumTraits<Scalar>::IsComplex,
|
||||
NeedToConjugate = IsComplex
|
||||
};
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(x._expression()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x) { return ei_conj(Base::extractScalarFactor(x._expression())); }
|
||||
};
|
||||
|
||||
// pop scalar multiple
|
||||
template<typename Scalar, typename NestedXpr> struct ei_blas_traits<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(x._expression()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
{ return x._functor().m_other * Base::extractScalarFactor(x._expression()); }
|
||||
};
|
||||
|
||||
// pop opposite
|
||||
template<typename Scalar, typename NestedXpr> struct ei_blas_traits<CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(x._expression()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
{ return - Base::extractScalarFactor(x._expression()); }
|
||||
};
|
||||
|
||||
// pop opposite
|
||||
template<typename NestedXpr> struct ei_blas_traits<NestByValue<NestedXpr> >
|
||||
: ei_blas_traits<NestedXpr>
|
||||
{
|
||||
typedef typename NestedXpr::Scalar Scalar;
|
||||
typedef ei_blas_traits<NestedXpr> Base;
|
||||
typedef NestByValue<NestedXpr> XprType;
|
||||
typedef typename Base::ActualXprType ActualXprType;
|
||||
static inline const ActualXprType& extract(const XprType& x) { return Base::extract(static_cast<const NestedXpr&>(x)); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
{ return Base::extractScalarFactor(static_cast<const NestedXpr&>(x)); }
|
||||
};
|
||||
|
||||
#endif // EIGEN_BLASUTIL_H
|
||||
|
Loading…
x
Reference in New Issue
Block a user