* fix a couple of remaining issues with previous commit,

* merge ei_product_blocking_traits into ei_gepb_traits
This commit is contained in:
Gael Guennebaud 2010-07-19 15:45:13 +02:00
parent f8aae7a908
commit 6e157dd7c6
8 changed files with 116 additions and 143 deletions

View File

@ -277,13 +277,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a,
{ {
// TODO optimize it for SSE3 and 4 // TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3 #ifdef EIGEN_VECTORIZE_SSE3
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(a.v, b.v), return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
// _mm_mul_pd(a.v, b.v/*ei_vec2d_swizzle1(b.v, 1, 0)*/))); _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
return Packet1cd(_mm_add_pd(_mm_mul_pd(a.v, b.v), ei_vec2d_swizzle1(b.v, 1, 0))));
_mm_mul_pd(a.v, ei_vec2d_swizzle1(b.v, 1, 0))));
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
// _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
// ei_vec2d_swizzle1(b.v, 1, 0))));
#else #else
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),

View File

@ -25,6 +25,9 @@
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class ei_gebp_traits;
/** \internal */ /** \internal */
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{ {
@ -97,7 +100,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
* for matrix products and related algorithms. The blocking sizes depends on various * for matrix products and related algorithms. The blocking sizes depends on various
* parameters: * parameters:
* - the L1 and L2 cache sizes, * - the L1 and L2 cache sizes,
* - the register level blocking sizes defined by ei_product_blocking_traits, * - the register level blocking sizes defined by ei_gebp_traits,
* - the number of scalars that fit into a packet (when vectorization is enabled). * - the number of scalars that fit into a packet (when vectorization is enabled).
* *
* \sa setCpuCacheSizes */ * \sa setCpuCacheSizes */
@ -114,9 +117,9 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
std::ptrdiff_t l1, l2; std::ptrdiff_t l1, l2;
enum { enum {
kdiv = KcFactor * 2 * ei_product_blocking_traits<LhsScalar,RhsScalar>::nr kdiv = KcFactor * 2 * ei_gebp_traits<LhsScalar,RhsScalar>::nr
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar), * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
mr = ei_product_blocking_traits<LhsScalar,RhsScalar>::mr, mr = ei_gebp_traits<LhsScalar,RhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr mr_mask = (0xffffffff/mr)*mr
}; };
@ -184,11 +187,21 @@ public:
enum { enum {
ConjLhs = _ConjLhs, ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs, ConjRhs = _ConjRhs,
Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable, Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// register block size along the N direction (must be either 2 or 4)
nr = NumberOfRegisters/4,
// register block size along the M direction (currently, this one cannot be modified)
mr = 2 * LhsPacketSize,
WorkSpaceFactor = nr * RhsPacketSize,
LhsProgress = LhsPacketSize, LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize RhsProgress = RhsPacketSize
}; };
@ -250,11 +263,16 @@ public:
enum { enum {
ConjLhs = _ConjLhs, ConjLhs = _ConjLhs,
ConjRhs = false, ConjRhs = false,
Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable, Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = NumberOfRegisters/4,
mr = 2 * LhsPacketSize,
WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = LhsPacketSize, LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize RhsProgress = RhsPacketSize
}; };
@ -307,12 +325,11 @@ public:
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{ {
r = ei_pmadd(c,alpha,r); r = cj.pmadd(c,alpha,r);
} }
protected: protected:
// ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,false> cj;
// ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
}; };
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
@ -332,6 +349,10 @@ public:
RealPacketSize = Vectorizable ? ei_packet_traits<RealScalar>::size : 1, RealPacketSize = Vectorizable ? ei_packet_traits<RealScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
nr = 2,
mr = 2 * ResPacketSize,
WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
LhsProgress = ResPacketSize, LhsProgress = ResPacketSize,
RhsProgress = Vectorizable ? 2*ResPacketSize : 1 RhsProgress = Vectorizable ? 2*ResPacketSize : 1
}; };
@ -424,7 +445,7 @@ public:
else if((ConjLhs)&&(ConjRhs)) else if((ConjLhs)&&(ConjRhs))
{ {
tmp = ei_pcplxflip(ResPacket(c.second)); tmp = ei_pcplxflip(ResPacket(c.second));
tmp = ei_padd(ei_pconj(ResPacket(c.first)),tmp); tmp = ei_psub(ei_pconj(ResPacket(c.first)),tmp);
} }
r = ei_pmadd(tmp,alpha,r); r = ei_pmadd(tmp,alpha,r);
@ -434,21 +455,6 @@ protected:
ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
}; };
/* Specialization for real * complex.
* The only subtility is how the lhs coefficients are loaded.
*/
template<typename Real>
struct ei_product_blocking_traits<Real, std::complex<Real> >
{
typedef std::complex<Real> Scalar;
enum {
Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
PacketSize = ei_packet_traits<Scalar>::size,
nr = 4,
mr = 2*PacketSize
};
};
template<typename RealScalar, bool _ConjRhs> template<typename RealScalar, bool _ConjRhs>
class ei_gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs > class ei_gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{ {
@ -465,7 +471,12 @@ public:
&& ei_packet_traits<Scalar>::Vectorizable, && ei_packet_traits<Scalar>::Vectorizable,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
mr = 2*ResPacketSize,
WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = ResPacketSize, LhsProgress = ResPacketSize,
RhsProgress = ResPacketSize RhsProgress = ResPacketSize
@ -500,7 +511,6 @@ public:
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{ {
dest = ei_ploaddup<LhsPacket>(a); dest = ei_ploaddup<LhsPacket>(a);
// dest = ei_pload<LhsPacket>(a);
} }
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
@ -878,7 +888,7 @@ EIGEN_ASM_COMMENT("mybegin4");
traits.madd(A0,B1,C1,B1); traits.madd(A0,B1,C1,B1);
traits.loadRhs(&blB[13*RhsProgress], B1); traits.loadRhs(&blB[13*RhsProgress], B1);
traits.madd(A0,B2,C2,B2); traits.madd(A0,B2,C2,B2);
traits.loadRhs(&blB[4*RhsProgress], B2); traits.loadRhs(&blB[14*RhsProgress], B2);
traits.madd(A0,B3,C3,B3); traits.madd(A0,B3,C3,B3);
traits.loadLhs(&blA[3*LhsProgress], A0); traits.loadLhs(&blA[3*LhsProgress], A0);

View File

@ -73,22 +73,15 @@ static void run(Index rows, Index cols, Index depth,
ei_const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking; typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = blocking.kc(); // cache block size along the K direction Index kc = blocking.kc(); // cache block size along the K direction
Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
//Index nc = blocking.nc(); // cache block size along the N direction //Index nc = blocking.nc(); // cache block size along the N direction
// FIXME starting from SSE3, normal complex product cannot be optimized as well as ei_gemm_pack_lhs<LhsScalar, Index, Traits::mr, LhsStorageOrder> pack_lhs;
// conjugate product, therefore it is better to conjugate during the copies. ei_gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
// With SSE2, this is the other way round. ei_gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr> gebp;
// ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
// ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
// ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP #ifdef EIGEN_HAS_OPENMP
if(info) if(info)
@ -98,7 +91,7 @@ static void run(Index rows, Index cols, Index depth,
Index threads = omp_get_num_threads(); Index threads = omp_get_num_threads();
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
Scalar* w = ei_aligned_stack_new(Scalar, sizeW); Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
Scalar* blockB = blocking.blockB(); Scalar* blockB = blocking.blockB();
ei_internal_assert(blockB!=0); ei_internal_assert(blockB!=0);
@ -170,10 +163,10 @@ static void run(Index rows, Index cols, Index depth,
// this is the sequential version! // this is the sequential version!
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*ei_packet_traits<RhsScalar>::size*Blocking::nr*2; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA(); LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA();
RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB(); RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW(); RhsScalar *blockW = /*blocking.blockW()==0 ?*/ ei_aligned_stack_new(RhsScalar, sizeW) /*: blocking.blockW()*/;
// For each horizontal panel of the rhs, and corresponding panel of the lhs... // For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1) // (==GEMM_VAR1)
@ -302,11 +295,11 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
}; };
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar; typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar; typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking; typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
enum { enum {
SizeA = ActualRows * MaxDepth, SizeA = ActualRows * MaxDepth,
SizeB = ActualCols * MaxDepth, SizeB = ActualCols * MaxDepth,
SizeW = MaxDepth * Blocking::nr * ei_packet_traits<RhsScalar>::size SizeW = MaxDepth * Traits::nr * ei_packet_traits<RhsScalar>::size
}; };
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
@ -342,7 +335,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
}; };
typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar; typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar;
typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar; typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar;
typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking; typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits;
DenseIndex m_sizeA; DenseIndex m_sizeA;
DenseIndex m_sizeB; DenseIndex m_sizeB;
@ -359,7 +352,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols
computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc); computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc);
m_sizeA = this->m_mc * this->m_kc; m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc; m_sizeB = this->m_kc * this->m_nc;
m_sizeW = this->m_kc*ei_packet_traits<RhsScalar>::size*Blocking::nr; m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
} }
void allocateA() void allocateA()

View File

@ -67,7 +67,7 @@ struct ei_symm_pack_lhs
if(rows-peeled_mc>=PacketSize) if(rows-peeled_mc>=PacketSize)
{ {
pack<PacketSize>(blockA, lhs, cols, peeled_mc, count); pack<mr/2>(blockA, lhs, cols, peeled_mc, count);
peeled_mc += PacketSize; peeled_mc += PacketSize;
} }
@ -253,9 +253,9 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
@ -263,14 +263,15 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
kc = std::min(kc,mc); kc = std::min(kc,mc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_symm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc) for(Index k2=0; k2<size; k2+=kc)
{ {
@ -305,7 +306,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
for(Index i2=k2+kc; i2<size; i2+=mc) for(Index i2=k2+kc; i2<size; i2+=mc)
{ {
const Index actual_mc = std::min(i2+mc,size)-i2; const Index actual_mc = std::min(i2+mc,size)-i2;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>() ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
@ -335,7 +336,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction Index mc = rows; // cache block size along the M direction
@ -343,13 +344,14 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; ei_symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc) for(Index k2=0; k2<size; k2+=kc)
{ {

View File

@ -68,20 +68,21 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
// if(AAT) // if(AAT)
// alpha = ei_conj(alpha); // alpha = ei_conj(alpha);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
Index kc = depth; // cache block size along the K direction Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction Index nc = size; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
// !!! mc must be a multiple of nr: // !!! mc must be a multiple of nr:
if(mc>Blocking::nr) if(mc>Traits::nr)
mc = (mc/Blocking::nr)*Blocking::nr; mc = (mc/Traits::nr)*Traits::nr;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*size; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
// note that the actual rhs is the transpose/adjoint of mat // note that the actual rhs is the transpose/adjoint of mat
enum { enum {
@ -89,10 +90,10 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
ConjRhs = NumTraits<Scalar>::IsComplex && AAT ConjRhs = NumTraits<Scalar>::IsComplex && AAT
}; };
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel; ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs; ei_gemm_pack_lhs<Scalar, Index, Traits::mr,MatStorageOrder, false> pack_lhs;
ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb; ei_sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc) for(Index k2=0; k2<depth; k2+=kc)
{ {

View File

@ -105,9 +105,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum { enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower IsLower = (Mode&Lower) == Lower
}; };
@ -117,18 +117,19 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
// Scalar* allocatedBlockB = new Scalar[sizeB]; // Scalar* allocatedBlockB = new Scalar[sizeB];
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero(); triangularBuffer.setZero();
triangularBuffer.diagonal().setOnes(); triangularBuffer.diagonal().setOnes();
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0; for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth; IsLower ? k2>0 : k2<depth;
@ -195,7 +196,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc) for(Index i2=start; i2<end; i2+=mc)
{ {
const Index actual_mc = std::min(i2+mc,end)-i2; const Index actual_mc = std::min(i2+mc,end)-i2;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>() ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
@ -228,9 +229,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum { enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower IsLower = (Mode&Lower) == Lower
}; };
@ -240,18 +241,19 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero(); triangularBuffer.setZero();
triangularBuffer.diagonal().setOnes(); triangularBuffer.diagonal().setOnes();
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_gemm_pack_lhs<Scalar, Index, Traits::mr,LhsStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth; for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0; IsLower ? k2<depth : k2>0;

View File

@ -57,9 +57,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
ei_blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); ei_blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum { enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower IsLower = (Mode&Lower) == Lower
}; };
@ -69,14 +69,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
ei_conj_if<Conjugate> conj; ei_conj_if<Conjugate> conj;
ei_gebp_kernel<Scalar, Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel; ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs; ei_gemm_pack_lhs<Scalar, Index, Traits::mr,TriStorageOrder> pack_lhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs; ei_gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
for(Index k2=IsLower ? 0 : size; for(Index k2=IsLower ? 0 : size;
IsLower ? k2<size : k2>0; IsLower ? k2<size : k2>0;
@ -191,15 +192,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
ei_blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); ei_blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; typedef ei_gebp_traits<Scalar,Scalar> Traits;
enum { enum {
RhsStorageOrder = TriStorageOrder, RhsStorageOrder = TriStorageOrder,
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower IsLower = (Mode&Lower) == Lower
}; };
// Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction // Index kc = std::min<Index>(Traits::Max_kc/4,size); // cache block size along the K direction
// Index mc = std::min<Index>(Blocking::Max_mc,size); // cache block size along the M direction // Index mc = std::min<Index>(Traits::Max_mc,size); // cache block size along the M direction
// check that !!!! // check that !!!!
Index kc = size; // cache block size along the K direction Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction Index mc = size; // cache block size along the M direction
@ -207,15 +208,16 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*ei_packet_traits<Scalar>::size*Blocking::nr + kc*size; std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*ei_packet_traits<Scalar>::size*Blocking::nr; Scalar* blockB = allocatedBlockB + sizeW;
ei_conj_if<Conjugate> conj; ei_conj_if<Conjugate> conj;
ei_gebp_kernel<Scalar,Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel; ei_gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel; ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel; ei_gemm_pack_lhs<Scalar, Index, Traits::mr, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0; for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size; IsLower ? k2>0 : k2<size;

View File

@ -151,39 +151,6 @@ class ei_const_blas_data_mapper
Index m_stride; Index m_stride;
}; };
// Defines various constant controlling register blocking for matrix-matrix algorithms.
template<typename LhsScalar, typename RhsScalar> struct ei_product_blocking_traits;
template<typename LhsScalar, typename RhsScalar>
struct ei_product_blocking_traits
{
enum {
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable
&& ei_packet_traits<RhsScalar>::Vectorizable
/*&& (ei_is_same_type<LhsScalar,RhsScalar>::ret
|| (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/,
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// register block size along the N direction (must be either 2 or 4)
nr = NumberOfRegisters/4,
// register block size along the M direction (currently, this one cannot be modified)
mr = 2 * LhsPacketSize
};
};
template<typename Real>
struct ei_product_blocking_traits<std::complex<Real>, std::complex<Real> >
{
typedef std::complex<Real> Scalar;
enum {
Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
PacketSize = ei_packet_traits<Scalar>::size,
nr = 2,
mr = 2 * PacketSize
};
};
/* Helper class to analyze the factors of a Product expression. /* Helper class to analyze the factors of a Product expression.
* In particular it allows to pop out operator-, scalar multiples, * In particular it allows to pop out operator-, scalar multiples,