mixing types step 3:

- improve support of ColMajor matrix * vector and matrix * matrix products
- all configurations are now handled correctly, but performance is not always very good
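
To make the scope concrete, here is a minimal user-level sketch of the mixed-scalar products this series targets (illustrative only; the sizes, the complex alpha and the exact typedefs are assumptions, not part of this commit):

#include <Eigen/Dense>
#include <complex>

int main()
{
  Eigen::MatrixXcf A = Eigen::MatrixXcf::Random(64, 64);  // complex, column-major lhs
  Eigen::VectorXf  b = Eigen::VectorXf::Random(64);       // real rhs vector
  Eigen::MatrixXf  B = Eigen::MatrixXf::Random(64, 8);    // real rhs matrix
  std::complex<float> alpha(2.f, 0.5f);

  // complex matrix * real vector: the ColMajor GEMV path touched below
  Eigen::VectorXcf y = alpha * A * b;

  // complex matrix * real matrix: the GEMM path, where alpha can be folded into the packed lhs
  Eigen::MatrixXcf C = alpha * A * B;

  return 0;
}
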
Gael Guennebaud 2010-07-11 23:57:23 +02:00
parent 8e3c4283f5
commit f8678272a4
6 changed files with 122 additions and 38 deletions

View File

@@ -326,6 +326,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
   typedef typename ProductType::LhsScalar LhsScalar;
   typedef typename ProductType::RhsScalar RhsScalar;
   typedef typename ProductType::Scalar ResScalar;
+  typedef typename ProductType::RealScalar RealScalar;
   typedef typename ProductType::ActualLhsType ActualLhsType;
   typedef typename ProductType::ActualRhsType ActualRhsType;
   typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
@@ -340,15 +341,29 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
   enum {
     // FIXME find a way to allow an inner stride on the result if ei_packet_traits<Scalar>::size==1
-    EvalToDest = Dest::InnerStrideAtCompileTime==1
+    EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+    ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex)
   };

+  bool alphaIsCompatible = (!ComplexByReal) || (ei_imag(actualAlpha)==RealScalar(0));
+  bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+  RhsScalar compatibleAlpha = ei_get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
   ResScalar* actualDest;
-  if (EvalToDest)
+  if (evalToDest)
+  {
     actualDest = &dest.coeffRef(0);
+  }
   else
   {
     actualDest = ei_aligned_stack_new(ResScalar,dest.size());
+    if(!alphaIsCompatible)
+    {
+      MappedDest(actualDest, dest.size()).setZero();
+      compatibleAlpha = RhsScalar(1);
+    }
+    else
       MappedDest(actualDest, dest.size()) = dest;
   }
@@ -358,10 +373,13 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
     actualRhs.data(), actualRhs.innerStride(),
     actualDest, 1,
-    actualAlpha);
+    compatibleAlpha);

-  if (!EvalToDest)
+  if (!evalToDest)
   {
+    if(!alphaIsCompatible)
+      dest += actualAlpha * MappedDest(actualDest, dest.size());
+    else
       dest = MappedDest(actualDest, dest.size());
     ei_aligned_stack_delete(ResScalar, actualDest, dest.size());
   }
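
In words, the dispatcher above only forwards alpha to the kernel when it can be represented as a RhsScalar; with a complex lhs and a real rhs, a genuinely complex alpha cannot, so the kernel is run with a factor of 1 into a zero-initialized temporary and the complex alpha is applied once at the end. A simplified standalone restatement of that control flow for LhsScalar = std::complex<float>, RhsScalar = float (hypothetical helper, not the Eigen internals):

#include <complex>
#include <vector>

// The kernel's alpha parameter has the rhs scalar type (float here), so a
// complex alpha can only be folded in when its imaginary part is zero.
void gemv_cplx_by_real(const std::vector<std::complex<float> >& lhs, // column-major, m x n
                       const std::vector<float>& rhs,                // size n
                       std::vector<std::complex<float> >& dest,      // size m, accumulated into
                       std::complex<float> alpha, int m, int n)
{
  bool alphaIsCompatible = (alpha.imag() == 0.f);
  float compatibleAlpha  = alphaIsCompatible ? alpha.real() : 1.f;

  // accumulate either directly into dest, or into a zero-initialized temporary
  std::vector<std::complex<float> > tmp;
  if(!alphaIsCompatible) tmp.assign(dest.size(), std::complex<float>(0.f, 0.f));
  std::vector<std::complex<float> >& acc = alphaIsCompatible ? dest : tmp;

  // stand-in for the column-major kernel: acc += compatibleAlpha * A * b
  for(int j = 0; j < n; ++j)
    for(int i = 0; i < m; ++i)
      acc[i] += compatibleAlpha * lhs[i + j*m] * rhs[j];

  // the full complex alpha is applied once at the end when it could not be folded in
  if(!alphaIsCompatible)
    for(int i = 0; i < m; ++i)
      dest[i] += alpha * tmp[i];
}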

View File

@@ -207,6 +207,15 @@ template<> struct ei_conj_helper<Packet4f, Packet2cf, false,false>
   { return Packet2cf(ei_pmul(x, y.v)); }
 };

+template<> struct ei_conj_helper<Packet2cf, Packet4f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+  { return ei_padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+  { return Packet2cf(ei_pmul(x.v, y)); }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for SSE3 and 4
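
The new ei_conj_helper<Packet2cf, Packet4f> specialization implements the complex * real multiply as a plain elementwise ei_pmul of the two __m128 registers. That is only the right complex-by-real product when the real packet carries the same factor in the real and imaginary slots (for instance a broadcast, as produced by pset1 in the matrix-vector path); per complex coefficient it is simply a scaling of both parts. A scalar reference of the per-coefficient operation, for illustration only:

#include <complex>

// One lane of the Packet2cf * Packet4f helper: multiplying a complex value by a
// real factor scales real and imaginary parts alike, i.e. elementwise
// multiplication of [re, im] by [a, a].
inline std::complex<float> cplx_times_real(const std::complex<float>& x, float a)
{
  return std::complex<float>(x.real() * a, x.imag() * a);
}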

View File

@@ -140,17 +140,26 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st
 #ifdef EIGEN_HAS_FUSE_CJMADD
 #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
 #else
-#define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,ResPacket(T));
+#define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
 #endif

-// optimized GEneral packed Block * packed Panel product kernel
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ *  | A | B | comments
+ *  |real|cplx| no vectorization yet, would require to pack A with duplication
+ *  |cplx|real| easy vectorization
+ */
 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct ei_gebp_kernel
 {
   typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

   enum {
-    Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
+    Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable /*ei_packet_traits<LhsScalar>::Vectorizable
+                  && ei_packet_traits<RhsScalar>::Vectorizable
+                  && (ei_is_same_type<LhsScalar,RhsScalar>::ret
+                    || (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/,
     LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
     RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
     ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
@@ -766,17 +775,23 @@ template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjuga
 struct ei_gemm_pack_lhs
 {
   void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
-                  Index stride=0, Index offset=0)
+                  Scalar alpha = Scalar(1), Index stride=0, Index offset=0)
   {
     enum { PacketSize = ei_packet_traits<Scalar>::size };
     ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
     ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
     ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
+    bool hasAlpha = alpha != Scalar(1);
     Index count = 0;
     Index peeled_mc = (rows/mr)*mr;
     for(Index i=0; i<peeled_mc; i+=mr)
     {
       if(PanelMode) count += mr * offset;
+      if(hasAlpha)
+        for(Index k=0; k<depth; k++)
+          for(Index w=0; w<mr; w++)
+            blockA[count++] = alpha * cj(lhs(i+w, k));
+      else
         for(Index k=0; k<depth; k++)
           for(Index w=0; w<mr; w++)
             blockA[count++] = cj(lhs(i+w, k));
@@ -785,6 +800,11 @@ struct ei_gemm_pack_lhs
     if(rows-peeled_mc>=PacketSize)
     {
       if(PanelMode) count += PacketSize*offset;
+      if(hasAlpha)
+        for(Index k=0; k<depth; k++)
+          for(Index w=0; w<PacketSize; w++)
+            blockA[count++] = alpha * cj(lhs(peeled_mc+w, k));
+      else
         for(Index k=0; k<depth; k++)
           for(Index w=0; w<PacketSize; w++)
             blockA[count++] = cj(lhs(peeled_mc+w, k));
@@ -794,6 +814,10 @@ struct ei_gemm_pack_lhs
     for(Index i=peeled_mc; i<rows; i++)
     {
       if(PanelMode) count += offset;
+      if(hasAlpha)
+        for(Index k=0; k<depth; k++)
+          blockA[count++] = alpha * cj(lhs(i, k));
+      else
         for(Index k=0; k<depth; k++)
           blockA[count++] = cj(lhs(i, k));
       if(PanelMode) count += (stride-offset-depth);
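
The ei_gemm_pack_lhs changes fold the scaling factor into the packed lhs block, relying on C += (alpha*A)*B being the same as C += alpha*(A*B); the extra multiplies are paid once per packed coefficient rather than on the full result, consistent with the gebp call further below taking no alpha argument of its own. A minimal sketch of the idea with a hypothetical, non-peeled packing routine (plain row-by-row layout, not Eigen's mr-interleaved blocked layout):

#include <cstddef>
#include <vector>

// Hypothetical helper: pack a rows x depth block of a column-major lhs into a
// contiguous buffer, optionally pre-multiplying every coefficient by alpha.
template<typename Scalar>
void pack_lhs_with_alpha(std::vector<Scalar>& blockA, const Scalar* lhs, std::ptrdiff_t lhsStride,
                         int depth, int rows, Scalar alpha)
{
  blockA.resize(static_cast<std::size_t>(rows) * depth);
  std::size_t count = 0;
  const bool hasAlpha = (alpha != Scalar(1));   // same test as in the diff above
  for(int i = 0; i < rows; ++i)
    for(int k = 0; k < depth; ++k)
      blockA[count++] = hasAlpha ? Scalar(alpha * lhs[i + k*lhsStride])
                                 : lhs[i + k*lhsStride];
}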

View File

@@ -74,6 +74,7 @@ static void run(Index rows, Index cols, Index depth,
   ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

   typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
+  bool alphaOnLhs = NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex;

   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = std::min(rows,blocking.mc());    // cache block size along the M direction
@@ -86,7 +87,7 @@
   ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
   ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr> gebp;

-//   if (ConjugateRhs)
+//   if ((ConjugateRhs && !alphaOnLhs) || (ConjugateLhs && alphaOnLhs))
 //     alpha = ei_conj(alpha);
 //   ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
 //   ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
@@ -177,6 +178,9 @@
     RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
     RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();

+    LhsScalar lhsAlpha = alphaOnLhs ? ei_get_factor<ResScalar,LhsScalar>::run(alpha) : LhsScalar(1);
+    RhsScalar rhsAlpha = alphaOnLhs ? RhsScalar(1) : ei_get_factor<ResScalar,RhsScalar>::run(alpha);
+
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
     // (==GEMM_VAR1)
     for(Index k2=0; k2<depth; k2+=kc)
@@ -187,7 +191,7 @@
       // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
       // Note that this panel will be read as many times as the number of blocks in the lhs's
       // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, rhsAlpha, actual_kc, cols);

       // For each mc x kc block of the lhs's vertical panel...
@@ -199,7 +203,7 @@
         // We pack the lhs's block into a sequential chunk of memory (L1 caching)
         // Note that this block will be read a very high number of times, which is equal to the number of
         // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc, lhsAlpha);

         // Everything is packed, we can now call the block * panel kernel:
         gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW);
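
The lhsAlpha/rhsAlpha split sends the factor to whichever packed operand can represent it without promotion: with a complex lhs and a real rhs (alphaOnLhs), the complex alpha travels with the packed lhs and the rhs keeps a neutral factor of 1; in every other configuration it stays on the rhs as before. A concrete illustration for LhsScalar = std::complex<float>, RhsScalar = float (hypothetical function, outside Eigen):

#include <complex>

// A complex alpha cannot be folded into the real rhs block, so it is folded
// into the packed lhs instead (the alphaOnLhs case above).
void split_alpha_cplx_lhs(const std::complex<float>& alpha,
                          std::complex<float>& lhsAlpha, float& rhsAlpha)
{
  lhsAlpha = alpha;  // forwarded to pack_lhs(..., lhsAlpha)
  rhsAlpha = 1.0f;   // pack_rhs leaves the rhs coefficients unscaled
}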

View File

@@ -30,7 +30,13 @@
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
- * TODO: since rhs gets evaluated only once, no need to evaluate it
+ *
+ * Mixing type logic: C += alpha * A * B
+ *  | A | B |alpha| comments
+ *  |real |cplx |cplx | no vectorization
+ *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
+ *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  */
 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
 struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
@@ -39,7 +45,7 @@ typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResS
 enum {
   Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable
-              && ei_packet_traits<LhsScalar>::size==ei_packet_traits<RhsScalar>::size,
+              && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size),
   LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
   RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
   ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
@@ -58,7 +64,7 @@ EIGEN_DONT_INLINE static void run(
   const LhsScalar* lhs, Index lhsStride,
   const RhsScalar* rhs, Index rhsIncr,
   ResScalar* res, Index resIncr,
-  ResScalar alpha)
+  RhsScalar alpha)
 {
   ei_internal_assert(resIncr==1);
   #ifdef _EIGEN_ACCUMULATE_PACKETS
@@ -182,6 +188,7 @@ EIGEN_DONT_INLINE static void run(
         if(peels>1)
         {
           LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+          ResPacket T0, T1;

           A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]);
           A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]);
@@ -195,20 +202,20 @@
             A00 = ei_pload<LhsPacket>(&lhs0[j]);
             A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
-            A00 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j]));
-            A10 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize]));
+            T0 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j]));
+            T1 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize]));

-            A00 = pcj.pmadd(A01, ptmp1, A00);
+            T0 = pcj.pmadd(A01, ptmp1, T0);
             A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  ei_palign<1>(A11,A01);
-            A00 = pcj.pmadd(A02, ptmp2, A00);
+            T0 = pcj.pmadd(A02, ptmp2, T0);
             A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  ei_palign<2>(A12,A02);
-            A00 = pcj.pmadd(A03, ptmp3, A00);
-            ei_pstore(&res[j],A00);
+            T0 = pcj.pmadd(A03, ptmp3, T0);
+            ei_pstore(&res[j],T0);
             A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  ei_palign<3>(A13,A03);
-            A10 = pcj.pmadd(A11, ptmp1, A10);
-            A10 = pcj.pmadd(A12, ptmp2, A10);
-            A10 = pcj.pmadd(A13, ptmp3, A10);
-            ei_pstore(&res[j+ResPacketSize],A10);
+            T1 = pcj.pmadd(A11, ptmp1, T1);
+            T1 = pcj.pmadd(A12, ptmp2, T1);
+            T1 = pcj.pmadd(A13, ptmp3, T1);
+            ei_pstore(&res[j+ResPacketSize],T1);
           }
         }
         for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
@@ -275,6 +282,16 @@
   }
 };

+/* Optimized row-major matrix * vector product:
+ * This algorithm processes 4 rows at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ *  - alpha is always a complex (or converted to a complex)
+ *  - no vectorization
+ */
 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
 struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
 {
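
The T0/T1 temporaries in the peeled column-major loop are more than a renaming: pcj.pmadd returns a ResPacket, and once the lhs and result scalar types may differ, the old pattern of storing that result back into the LhsPacket registers A00/A10 would no longer type-check for mixed instantiations. A scalar analogue of the issue (illustrative types only):

#include <complex>

// With a real lhs coefficient and a complex accumulator, the multiply-add
// result is complex; it needs its own (ResPacket-like) variable and cannot be
// written back into the real lhs variable.
inline void mixed_madd_step(float lhsCoeff, std::complex<float> rhsCoeff,
                            std::complex<float>& acc)
{
  acc += lhsCoeff * rhsCoeff;   // acc plays the role of T0/T1
}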

View File

@@ -115,6 +115,13 @@ template<typename RealScalar,bool Conj> struct ei_conj_helper<RealScalar, std::c
   { return x*ei_conj_if<Conj>()(y); }
 };

+template<typename From,typename To> struct ei_get_factor {
+  EIGEN_STRONG_INLINE static To run(const From& x) { return x; }
+};
+
+template<typename Scalar> struct ei_get_factor<Scalar,typename NumTraits<Scalar>::Real> {
+  EIGEN_STRONG_INLINE static typename NumTraits<Scalar>::Real run(const Scalar& x) { return ei_real(x); }
+};
+
 // Lightweight helper class to access matrix coefficients.
 // Yes, this is somehow redundant with Map<>, but this version is much much lighter,
@@ -151,7 +158,11 @@ template<typename LhsScalar, typename RhsScalar>
 struct ei_product_blocking_traits
 {
   enum {
-    LhsPacketSize = ei_packet_traits<LhsScalar>::size,
+    Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable
+                && ei_packet_traits<RhsScalar>::Vectorizable
+                && (ei_is_same_type<LhsScalar,RhsScalar>::ret
+                  || (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex)),
+    LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,

     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     // register block size along the N direction (must be either 2 or 4)
@@ -167,6 +178,7 @@ struct ei_product_blocking_traits<std::complex<Real>, std::complex<Real> >
 {
   typedef std::complex<Real> Scalar;
   enum {
+    Vectorizable = ei_packet_traits<Scalar>::Vectorizable,
     PacketSize = ei_packet_traits<Scalar>::size,
     nr = 2,
     mr = 2 * PacketSize
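
ei_get_factor, added in this file, is what lets a ResScalar alpha be handed to code expecting a possibly narrower scalar type: the generic case is a plain conversion, and the complex-to-real specialization keeps the real part, which callers rely on only when the imaginary part is known to be zero (the GEMV path) or when the factor is subsequently reset to 1. A small standalone equivalent and usage sketch (hypothetical names mirroring the diff, using std::complex directly instead of NumTraits):

#include <complex>

// Stand-ins for ei_get_factor's two cases:
template<typename From, typename To> struct get_factor {
  static To run(const From& x) { return To(x); }                          // generic: plain conversion
};
template<typename Scalar> struct get_factor<std::complex<Scalar>, Scalar> {
  static Scalar run(const std::complex<Scalar>& x) { return x.real(); }   // complex -> real: keep the real part
};

int main()
{
  std::complex<float> alpha(3.f, 0.f);
  float rhsAlpha = get_factor<std::complex<float>, float>::run(alpha);    // 3.0f, usable as a real kernel factor
  std::complex<float> lhsAlpha =
      get_factor<std::complex<float>, std::complex<float> >::run(alpha);  // unchanged complex factor
  return (rhsAlpha == 3.f && lhsAlpha == alpha) ? 0 : 1;
}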