Fix performance regression in dgemm introduced by changeset 5d51a7f12c69138ed2a43df240bdf27a5313f7ce

This commit is contained in:
Gael Guennebaud 2016-07-02 17:35:08 +02:00
parent 672076db5d
commit 0fa9e4a15c

View File

@@ -1526,12 +1526,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
// The following piece of code won't work for 512 bit registers // The following piece of code won't work for 512 bit registers
// Moreover it assumes that there is a half packet of the same size // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type. // as nr (which is currently 4) for the return type.
typedef typename unpacket_traits<SResPacket>::half SResPacketHalf; typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
if ((SwappedTraits::LhsProgress % 4) == 0 && if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) && (SwappedTraits::LhsProgress <= 8) &&
unpacket_traits<SResPacketHalf>::size==4) (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
{ {
SAccPacket C0, C1, C2, C3; SAccPacket C0, C1, C2, C3;
straits.initAcc(C0); straits.initAcc(C0);