mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
* fix a couple of remaining issues with previous commit,
* merge ei_product_blocking_traits into ei_gepb_traits
This commit is contained in:
parent
f8aae7a908
commit
6e157dd7c6
@ -277,13 +277,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a,
|
||||
{
|
||||
// TODO optimize it for SSE3 and 4
|
||||
#ifdef EIGEN_VECTORIZE_SSE3
|
||||
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(a.v, b.v),
|
||||
// _mm_mul_pd(a.v, b.v/*ei_vec2d_swizzle1(b.v, 1, 0)*/)));
|
||||
return Packet1cd(_mm_add_pd(_mm_mul_pd(a.v, b.v),
|
||||
_mm_mul_pd(a.v, ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
||||
// _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
||||
// ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||
return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
||||
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
||||
ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||
#else
|
||||
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
|
||||
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
||||
|
@ -25,6 +25,9 @@
|
||||
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
|
||||
#define EIGEN_GENERAL_BLOCK_PANEL_H
|
||||
|
||||
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
|
||||
class ei_gebp_traits;
|
||||
|
||||
/** \internal */
|
||||
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
|
||||
{
|
||||
@ -97,7 +100,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
|
||||
* for matrix products and related algorithms. The blocking sizes depends on various
|
||||
* parameters:
|
||||
* - the L1 and L2 cache sizes,
|
||||
* - the register level blocking sizes defined by ei_product_blocking_traits,
|
||||
* - the register level blocking sizes defined by ei_gebp_traits,
|
||||
* - the number of scalars that fit into a packet (when vectorization is enabled).
|
||||
*
|
||||
* \sa setCpuCacheSizes */
|
||||
@ -114,9 +117,9 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
|
||||
std::ptrdiff_t l1, l2;
|
||||
|
||||
enum {
|
||||
kdiv = KcFactor * 2 * ei_product_blocking_traits<LhsScalar,RhsScalar>::nr
|
||||
kdiv = KcFactor * 2 * ei_gebp_traits<LhsScalar,RhsScalar>::nr
|
||||
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
|
||||
mr = ei_product_blocking_traits<LhsScalar,RhsScalar>::mr,
|
||||
mr = ei_gebp_traits<LhsScalar,RhsScalar>::mr,
|
||||
mr_mask = (0xffffffff/mr)*mr
|
||||
};
|
||||
|
||||
@ -184,10 +187,20 @@ public:
|
||||
enum {
|
||||
ConjLhs = _ConjLhs,
|
||||
ConjRhs = _ConjRhs,
|
||||
Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable,
|
||||
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
|
||||
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
|
||||
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
|
||||
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
|
||||
|
||||
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
||||
|
||||
// register block size along the N direction (must be either 2 or 4)
|
||||
nr = NumberOfRegisters/4,
|
||||
|
||||
// register block size along the M direction (currently, this one cannot be modified)
|
||||
mr = 2 * LhsPacketSize,
|
||||
|
||||
WorkSpaceFactor = nr * RhsPacketSize,
|
||||
|
||||
LhsProgress = LhsPacketSize,
|
||||
RhsProgress = RhsPacketSize
|
||||
@ -250,10 +263,15 @@ public:
|
||||
enum {
|
||||
ConjLhs = _ConjLhs,
|
||||
ConjRhs = false,
|
||||
Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable,
|
||||
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
|
||||
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
|
||||
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
|
||||
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
|
||||
|
||||
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
||||
nr = NumberOfRegisters/4,
|
||||
mr = 2 * LhsPacketSize,
|
||||
WorkSpaceFactor = nr*RhsPacketSize,
|
||||
|
||||
LhsProgress = LhsPacketSize,
|
||||
RhsProgress = RhsPacketSize
|
||||
@ -307,12 +325,11 @@ public:
|
||||
|
||||
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
|
||||
{
|
||||
r = ei_pmadd(c,alpha,r);
|
||||
r = cj.pmadd(c,alpha,r);
|
||||
}
|
||||
|
||||
protected:
|
||||
// ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
|
||||
// ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
|
||||
ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,false> cj;
|
||||
};
|
||||
|
||||
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
|
||||
@ -331,6 +348,10 @@ public:
|
||||
&& ei_packet_traits<Scalar>::Vectorizable,
|
||||
RealPacketSize = Vectorizable ? ei_packet_traits<RealScalar>::size : 1,
|
||||
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
|
||||
|
||||
nr = 2,
|
||||
mr = 2 * ResPacketSize,
|
||||
WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
|
||||
|
||||
LhsProgress = ResPacketSize,
|
||||
RhsProgress = Vectorizable ? 2*ResPacketSize : 1
|
||||
@ -424,7 +445,7 @@ public:
|
||||
else if((ConjLhs)&&(ConjRhs))
|
||||
{
|
||||
tmp = ei_pcplxflip(ResPacket(c.second));
|
||||
tmp = ei_padd(ei_pconj(ResPacket(c.first)),tmp);
|
||||
tmp = ei_psub(ei_pconj(ResPacket(c.first)),tmp);
|
||||
}
|
||||
|
||||
r = ei_pmadd(tmp,alpha,r);
|
||||
@ -434,21 +455,6 @@ protected:
|
||||
ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
|
||||
};
|
||||
|
||||
/* Specialization for real * complex.
|
||||
* The only subtility is how the lhs coefficients are loaded.
|
||||
*/
|
||||
template<typename Real>
|
||||
struct ei_product_blocking_traits<Real, std::complex<Real> >
|
||||
{
|
||||
typedef std::complex<Real> Scalar;
|
||||
enum {
|
||||
Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
|
||||
PacketSize = ei_packet_traits<Scalar>::size,
|
||||
nr = 4,
|
||||
mr = 2*PacketSize
|
||||
};
|
||||
};
|
||||
|
||||
template<typename RealScalar, bool _ConjRhs>
|
||||
class ei_gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
|
||||
{
|
||||
@ -465,7 +471,12 @@ public:
|
||||
&& ei_packet_traits<Scalar>::Vectorizable,
|
||||
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
|
||||
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
|
||||
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
|
||||
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
|
||||
|
||||
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
||||
nr = 4,
|
||||
mr = 2*ResPacketSize,
|
||||
WorkSpaceFactor = nr*RhsPacketSize,
|
||||
|
||||
LhsProgress = ResPacketSize,
|
||||
RhsProgress = ResPacketSize
|
||||
@ -500,7 +511,6 @@ public:
|
||||
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
||||
{
|
||||
dest = ei_ploaddup<LhsPacket>(a);
|
||||
// dest = ei_pload<LhsPacket>(a);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
|
||||
@ -878,7 +888,7 @@ EIGEN_ASM_COMMENT("mybegin4");
|
||||
traits.madd(A0,B1,C1,B1);
|
||||
traits.loadRhs(&blB[13*RhsProgress], B1);
|
||||
traits.madd(A0,B2,C2,B2);
|
||||
traits.loadRhs(&blB[4*RhsProgress], B2);
|
||||
traits.loadRhs(&blB[14*RhsProgress], B2);
|
||||
traits.madd(A0,B3,C3,B3);
|
||||
|
||||
traits.loadLhs(&blA[3*LhsProgress], A0);
|
||||
|
@ -73,22 +73,15 @@ static void run(Index rows, Index cols, Index depth,
|
||||
ei_const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
|
||||
typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
|
||||
//Index nc = blocking.nc(); // cache block size along the N direction
|
||||
|
||||
// FIXME starting from SSE3, normal complex product cannot be optimized as well as
|
||||
// conjugate product, therefore it is better to conjugate during the copies.
|
||||
// With SSE2, this is the other way round.
|
||||
ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
|
||||
ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
|
||||
ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr> gebp;
|
||||
|
||||
// ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
|
||||
// ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
|
||||
// ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
ei_gemm_pack_lhs<LhsScalar, Index, Traits::mr, LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
ei_gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
|
||||
#ifdef EIGEN_HAS_OPENMP
|
||||
if(info)
|
||||
@ -98,7 +91,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
Index threads = omp_get_num_threads();
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
|
||||
Scalar* blockB = blocking.blockB();
|
||||
ei_internal_assert(blockB!=0);
|
||||
@ -170,10 +163,10 @@ static void run(Index rows, Index cols, Index depth,
|
||||
// this is the sequential version!
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
std::size_t sizeW = kc*ei_packet_traits<RhsScalar>::size*Blocking::nr*2;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA();
|
||||
RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
|
||||
RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();
|
||||
RhsScalar *blockW = /*blocking.blockW()==0 ?*/ ei_aligned_stack_new(RhsScalar, sizeW) /*: blocking.blockW()*/;
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
// (==GEMM_VAR1)
|
||||
@ -302,11 +295,11 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
|
||||
};
|
||||
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
|
||||
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
|
||||
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
|
||||
typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
enum {
|
||||
SizeA = ActualRows * MaxDepth,
|
||||
SizeB = ActualCols * MaxDepth,
|
||||
SizeW = MaxDepth * Blocking::nr * ei_packet_traits<RhsScalar>::size
|
||||
SizeW = MaxDepth * Traits::nr * ei_packet_traits<RhsScalar>::size
|
||||
};
|
||||
|
||||
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
|
||||
@ -342,7 +335,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
|
||||
};
|
||||
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
|
||||
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
|
||||
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
|
||||
typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
DenseIndex m_sizeA;
|
||||
DenseIndex m_sizeB;
|
||||
@ -359,7 +352,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc);
|
||||
m_sizeA = this->m_mc * this->m_kc;
|
||||
m_sizeB = this->m_kc * this->m_nc;
|
||||
m_sizeW = this->m_kc*ei_packet_traits<RhsScalar>::size*Blocking::nr;
|
||||
m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
|
||||
}
|
||||
|
||||
void allocateA()
|
||||
|
@ -67,7 +67,7 @@ struct ei_symm_pack_lhs
|
||||
|
||||
if(rows-peeled_mc>=PacketSize)
|
||||
{
|
||||
pack<PacketSize>(blockA, lhs, cols, peeled_mc, count);
|
||||
pack<mr/2>(blockA, lhs, cols, peeled_mc, count);
|
||||
peeled_mc += PacketSize;
|
||||
}
|
||||
|
||||
@ -253,9 +253,9 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
|
||||
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
|
||||
@ -263,14 +263,15 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
|
||||
kc = std::min(kc,mc);
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_symm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
||||
|
||||
for(Index k2=0; k2<size; k2+=kc)
|
||||
{
|
||||
@ -305,7 +306,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
|
||||
for(Index i2=k2+kc; i2<size; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = std::min(i2+mc,size)-i2;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>()
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
@ -335,7 +336,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
|
||||
|
||||
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
@ -343,13 +344,14 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
|
||||
for(Index k2=0; k2<size; k2+=kc)
|
||||
{
|
||||
|
@ -68,20 +68,21 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
|
||||
// if(AAT)
|
||||
// alpha = ei_conj(alpha);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
Index kc = depth; // cache block size along the K direction
|
||||
Index mc = size; // cache block size along the M direction
|
||||
Index nc = size; // cache block size along the N direction
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
|
||||
// !!! mc must be a multiple of nr:
|
||||
if(mc>Blocking::nr)
|
||||
mc = (mc/Blocking::nr)*Blocking::nr;
|
||||
if(mc>Traits::nr)
|
||||
mc = (mc/Traits::nr)*Traits::nr;
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*size;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*size;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
// note that the actual rhs is the transpose/adjoint of mat
|
||||
enum {
|
||||
@ -89,10 +90,10 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
|
||||
ConjRhs = NumTraits<Scalar>::IsComplex && AAT
|
||||
};
|
||||
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs;
|
||||
ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,MatStorageOrder, false> pack_lhs;
|
||||
ei_sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb;
|
||||
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
{
|
||||
|
@ -105,9 +105,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
|
||||
IsLower = (Mode&Lower) == Lower
|
||||
};
|
||||
|
||||
@ -117,18 +117,19 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
// Scalar* allocatedBlockB = new Scalar[sizeB];
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
|
||||
triangularBuffer.setZero();
|
||||
triangularBuffer.diagonal().setOnes();
|
||||
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
|
||||
for(Index k2=IsLower ? depth : 0;
|
||||
IsLower ? k2>0 : k2<depth;
|
||||
@ -195,7 +196,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
for(Index i2=start; i2<end; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = std::min(i2+mc,end)-i2;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>()
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
@ -228,9 +229,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
|
||||
IsLower = (Mode&Lower) == Lower
|
||||
};
|
||||
|
||||
@ -240,18 +241,19 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
|
||||
triangularBuffer.setZero();
|
||||
triangularBuffer.diagonal().setOnes();
|
||||
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
|
||||
for(Index k2=IsLower ? 0 : depth;
|
||||
IsLower ? k2<depth : k2>0;
|
||||
|
@ -57,9 +57,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
|
||||
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
|
||||
ei_blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
|
||||
IsLower = (Mode&Lower) == Lower
|
||||
};
|
||||
|
||||
@ -69,14 +69,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
|
||||
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
ei_conj_if<Conjugate> conj;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs;
|
||||
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,TriStorageOrder> pack_lhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
|
||||
|
||||
for(Index k2=IsLower ? 0 : size;
|
||||
IsLower ? k2<size : k2>0;
|
||||
@ -191,15 +192,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
|
||||
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
|
||||
ei_blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
|
||||
|
||||
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
|
||||
typedef ei_gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
RhsStorageOrder = TriStorageOrder,
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
|
||||
IsLower = (Mode&Lower) == Lower
|
||||
};
|
||||
|
||||
// Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction
|
||||
// Index mc = std::min<Index>(Blocking::Max_mc,size); // cache block size along the M direction
|
||||
// Index kc = std::min<Index>(Traits::Max_kc/4,size); // cache block size along the K direction
|
||||
// Index mc = std::min<Index>(Traits::Max_mc,size); // cache block size along the M direction
|
||||
// check that !!!!
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = size; // cache block size along the M direction
|
||||
@ -207,15 +208,16 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
|
||||
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*size;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*size;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
|
||||
Scalar* blockB = allocatedBlockB + sizeW;
|
||||
|
||||
ei_conj_if<Conjugate> conj;
|
||||
ei_gebp_kernel<Scalar,Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel;
|
||||
ei_gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Traits::mr, ColMajor, false, true> pack_lhs_panel;
|
||||
|
||||
for(Index k2=IsLower ? size : 0;
|
||||
IsLower ? k2>0 : k2<size;
|
||||
|
@ -151,39 +151,6 @@ class ei_const_blas_data_mapper
|
||||
Index m_stride;
|
||||
};
|
||||
|
||||
// Defines various constant controlling register blocking for matrix-matrix algorithms.
|
||||
template<typename LhsScalar, typename RhsScalar> struct ei_product_blocking_traits;
|
||||
|
||||
template<typename LhsScalar, typename RhsScalar>
|
||||
struct ei_product_blocking_traits
|
||||
{
|
||||
enum {
|
||||
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable
|
||||
&& ei_packet_traits<RhsScalar>::Vectorizable
|
||||
/*&& (ei_is_same_type<LhsScalar,RhsScalar>::ret
|
||||
|| (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/,
|
||||
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
|
||||
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
||||
|
||||
// register block size along the N direction (must be either 2 or 4)
|
||||
nr = NumberOfRegisters/4,
|
||||
|
||||
// register block size along the M direction (currently, this one cannot be modified)
|
||||
mr = 2 * LhsPacketSize
|
||||
};
|
||||
};
|
||||
|
||||
template<typename Real>
|
||||
struct ei_product_blocking_traits<std::complex<Real>, std::complex<Real> >
|
||||
{
|
||||
typedef std::complex<Real> Scalar;
|
||||
enum {
|
||||
Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
|
||||
PacketSize = ei_packet_traits<Scalar>::size,
|
||||
nr = 2,
|
||||
mr = 2 * PacketSize
|
||||
};
|
||||
};
|
||||
|
||||
/* Helper class to analyze the factors of a Product expression.
|
||||
* In particular it allows to pop out operator-, scalar multiples,
|
||||
|
Loading…
x
Reference in New Issue
Block a user