* fix a couple of remaining issues with previous commit,

* merge ei_product_blocking_traits into ei_gepb_traits
This commit is contained in:
Gael Guennebaud 2010-07-19 15:45:13 +02:00
parent f8aae7a908
commit 6e157dd7c6
8 changed files with 116 additions and 143 deletions

View File

@ -277,13 +277,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a,
{
// TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(a.v, b.v),
// _mm_mul_pd(a.v, b.v/*ei_vec2d_swizzle1(b.v, 1, 0)*/)));
return Packet1cd(_mm_add_pd(_mm_mul_pd(a.v, b.v),
_mm_mul_pd(a.v, ei_vec2d_swizzle1(b.v, 1, 0))));
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
// _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
// ei_vec2d_swizzle1(b.v, 1, 0))));
return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
ei_vec2d_swizzle1(b.v, 1, 0))));
#else
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),

View File

@ -25,6 +25,9 @@
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class ei_gebp_traits;
/** \internal */
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
@ -97,7 +100,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
* for matrix products and related algorithms. The blocking sizes depends on various
* parameters:
* - the L1 and L2 cache sizes,
* - the register level blocking sizes defined by ei_product_blocking_traits,
* - the register level blocking sizes defined by ei_gebp_traits,
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
@ -114,9 +117,9 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
std::ptrdiff_t l1, l2;
enum {
kdiv = KcFactor * 2 * ei_product_blocking_traits<LhsScalar,RhsScalar>::nr
kdiv = KcFactor * 2 * ei_gebp_traits<LhsScalar,RhsScalar>::nr
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
mr = ei_product_blocking_traits<LhsScalar,RhsScalar>::mr,
mr = ei_gebp_traits<LhsScalar,RhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr
};
@ -184,10 +187,20 @@ public:
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable,
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// register block size along the N direction (must be either 2 or 4)
nr = NumberOfRegisters/4,
// register block size along the M direction (currently, this one cannot be modified)
mr = 2 * LhsPacketSize,
WorkSpaceFactor = nr * RhsPacketSize,
LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize
@ -250,10 +263,15 @@ public:
enum {
ConjLhs = _ConjLhs,
ConjRhs = false,
Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable,
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = NumberOfRegisters/4,
mr = 2 * LhsPacketSize,
WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize
@ -307,12 +325,11 @@ public:
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{
r = ei_pmadd(c,alpha,r);
r = cj.pmadd(c,alpha,r);
}
protected:
// ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
// ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,false> cj;
};
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
@ -331,6 +348,10 @@ public:
&& ei_packet_traits<Scalar>::Vectorizable,
RealPacketSize = Vectorizable ? ei_packet_traits<RealScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
nr = 2,
mr = 2 * ResPacketSize,
WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
LhsProgress = ResPacketSize,
RhsProgress = Vectorizable ? 2*ResPacketSize : 1
@ -424,7 +445,7 @@ public:
else if((ConjLhs)&&(ConjRhs))
{
tmp = ei_pcplxflip(ResPacket(c.second));
tmp = ei_padd(ei_pconj(ResPacket(c.first)),tmp);
tmp = ei_psub(ei_pconj(ResPacket(c.first)),tmp);
}
r = ei_pmadd(tmp,alpha,r);
@ -434,21 +455,6 @@ protected:
ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
/* Specialization for real * complex.
* The only subtility is how the lhs coefficients are loaded.
*/
template<typename Real>
struct ei_product_blocking_traits<Real, std::complex<Real> >
{
typedef std::complex<Real> Scalar;
enum {
Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
PacketSize = ei_packet_traits<Scalar>::size,
nr = 4,
mr = 2*PacketSize
};
};
template<typename RealScalar, bool _ConjRhs>
class ei_gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
@ -465,7 +471,12 @@ public:
&& ei_packet_traits<Scalar>::Vectorizable,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
mr = 2*ResPacketSize,
WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = ResPacketSize,
RhsProgress = ResPacketSize
@ -500,7 +511,6 @@ public:
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
dest = ei_ploaddup<LhsPacket>(a);
// dest = ei_pload<LhsPacket>(a);
}
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
@ -878,7 +888,7 @@ EIGEN_ASM_COMMENT("mybegin4");
traits.madd(A0,B1,C1,B1);
traits.loadRhs(&blB[13*RhsProgress], B1);
traits.madd(A0,B2,C2,B2);
traits.loadRhs(&blB[4*RhsProgress], B2);
traits.loadRhs(&blB[14*RhsProgress], B2);
traits.madd(A0,B3,C3,B3);
traits.loadLhs(&blA[3*LhsProgress], A0);

View File

@ -73,22 +73,15 @@ static void run(Index rows, Index cols, Index depth,
ei_const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
//Index nc = blocking.nc(); // cache block size along the N direction
// FIXME starting from SSE3, normal complex product cannot be optimized as well as
// conjugate product, therefore it is better to conjugate during the copies.
// With SSE2, this is the other way round.
ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr> gebp;
// ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
// ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
// ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
ei_gemm_pack_lhs<LhsScalar, Index, Traits::mr, LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
ei_gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
@ -98,7 +91,7 @@ static void run(Index rows, Index cols, Index depth,
Index threads = omp_get_num_threads();
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
Scalar* blockB = blocking.blockB();
ei_internal_assert(blockB!=0);
@ -170,10 +163,10 @@ static void run(Index rows, Index cols, Index depth,
// this is the sequential version!
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*ei_packet_traits<RhsScalar>::size*Blocking::nr*2;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA();
RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();
RhsScalar *blockW = /*blocking.blockW()==0 ?*/ ei_aligned_stack_new(RhsScalar, sizeW) /*: blocking.blockW()*/;
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
@ -302,11 +295,11 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
};
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
SizeA = ActualRows * MaxDepth,
SizeB = ActualCols * MaxDepth,
SizeW = MaxDepth * Blocking::nr * ei_packet_traits<RhsScalar>::size
SizeW = MaxDepth * Traits::nr * ei_packet_traits<RhsScalar>::size
};
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
@ -342,7 +335,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
};
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
DenseIndex m_sizeA;
DenseIndex m_sizeB;
@ -359,7 +352,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc);
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
m_sizeW = this->m_kc*ei_packet_traits<RhsScalar>::size*Blocking::nr;
m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
}
void allocateA()

View File

@ -67,7 +67,7 @@ struct ei_symm_pack_lhs
if(rows-peeled_mc>=PacketSize)
{
pack<PacketSize>(blockA, lhs, cols, peeled_mc, count);
pack<mr/2>(blockA, lhs, cols, peeled_mc, count);
peeled_mc += PacketSize;
}
@ -253,9 +253,9 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
@ -263,14 +263,15 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
kc = std::min(kc,mc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_symm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@ -305,7 +306,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = std::min(i2+mc,size)-i2;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>()
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
@ -335,7 +336,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
@ -343,13 +344,14 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
{

View File

@ -68,20 +68,21 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
// if(AAT)
// alpha = ei_conj(alpha);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
// !!! mc must be a multiple of nr:
if(mc>Blocking::nr)
mc = (mc/Blocking::nr)*Blocking::nr;
if(mc>Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*size;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
// note that the actual rhs is the transpose/adjoint of mat
enum {
@ -89,10 +90,10 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
ConjRhs = NumTraits<Scalar>::IsComplex && AAT
};
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs;
ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb;
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,MatStorageOrder, false> pack_lhs;
ei_sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
{

View File

@ -105,9 +105,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
};
@ -117,18 +117,19 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
// Scalar* allocatedBlockB = new Scalar[sizeB];
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
triangularBuffer.diagonal().setOnes();
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth;
@ -195,7 +196,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = std::min(i2+mc,end)-i2;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>()
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
@ -228,9 +229,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
};
@ -240,18 +241,19 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
triangularBuffer.diagonal().setOnes();
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0;

View File

@ -57,9 +57,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
ei_blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
};
@ -69,14 +69,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
ei_conj_if<Conjugate> conj;
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs;
ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr,TriStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
for(Index k2=IsLower ? 0 : size;
IsLower ? k2<size : k2>0;
@ -191,15 +192,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
ei_blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking;
typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum {
RhsStorageOrder = TriStorageOrder,
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr),
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
};
// Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction
// Index mc = std::min<Index>(Blocking::Max_mc,size); // cache block size along the M direction
// Index kc = std::min<Index>(Traits::Max_kc/4,size); // cache block size along the K direction
// Index mc = std::min<Index>(Traits::Max_mc,size); // cache block size along the M direction
// check that !!!!
Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
@ -207,15 +208,16 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*size;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr;
Scalar* blockB = allocatedBlockB + sizeW;
ei_conj_if<Conjugate> conj;
ei_gebp_kernel<Scalar,Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel;
ei_gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
ei_gemm_pack_lhs<Scalar, Index, Traits::mr, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;

View File

@ -151,39 +151,6 @@ class ei_const_blas_data_mapper
Index m_stride;
};
// Defines various constant controlling register blocking for matrix-matrix algorithms.
template<typename LhsScalar, typename RhsScalar> struct ei_product_blocking_traits;
template<typename LhsScalar, typename RhsScalar>
struct ei_product_blocking_traits
{
enum {
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable
&& ei_packet_traits<RhsScalar>::Vectorizable
/*&& (ei_is_same_type<LhsScalar,RhsScalar>::ret
|| (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// register block size along the N direction (must be either 2 or 4)
nr = NumberOfRegisters/4,
// register block size along the M direction (currently, this one cannot be modified)
mr = 2 * LhsPacketSize
};
};
template<typename Real>
struct ei_product_blocking_traits<std::complex<Real>, std::complex<Real> >
{
typedef std::complex<Real> Scalar;
enum {
Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
PacketSize = ei_packet_traits<Scalar>::size,
nr = 2,
mr = 2 * PacketSize
};
};
/* Helper class to analyze the factors of a Product expression.
* In particular it allows to pop out operator-, scalar multiples,