Don't optimize the processing of the last rows of a matrix-matrix product in cases that violate the assumptions made by the optimized code path.

This commit is contained in:
Benoit Steiner 2016-05-23 15:13:16 -07:00
parent 7aa5bc9558
commit 5d51a7f12c

View File

@ -1625,9 +1625,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
prefetch(&blA[0]); prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
if( (SwappedTraits::LhsProgress % 4)==0 ) // The following piece of code won't work for 512-bit registers
// Moreover it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type.
typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) &&
unpacket_traits<SResPacketHalf>::size==4)
{ {
// NOTE: The following piece of code won't work for 512-bit registers
SAccPacket C0, C1, C2, C3; SAccPacket C0, C1, C2, C3;
straits.initAcc(C0); straits.initAcc(C0);
straits.initAcc(C1); straits.initAcc(C1);