Fix -Waggressive-loop-optimizations

Charles Schlosser 2023-07-21 03:47:40 +00:00
parent 6e7abeae69
commit 4e9e493b4a

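This change hoists each packet loop's end bound out of the loop condition: the bounds fullColBlockEnd, halfColBlockEnd, and quarterColBlockEnd are computed once as exact multiples of the packet sizes, and each narrower loop starts where the previous block ended, instead of testing j+LhsPacketSize<=cols against a shared running index on every iteration. GCC's -Waggressive-loop-optimizations typically fires on the old shape when the optimizer, assuming the signed arithmetic in the condition cannot overflow, derives an iteration it can prove invokes undefined behavior. Below is a minimal standalone sketch of the before/after pattern (not Eigen code; the function names and the packet width are hypothetical):

#include <cstddef>

using Index = std::ptrdiff_t;    // Eigen's Index is a signed type; assumed here
constexpr Index kPacketSize = 4; // hypothetical stand-in for LhsPacketSize

// Old shape: shared running index, arithmetic in every loop condition.
float dot_old(const float* x, const float* y, Index n) {
  float acc = 0.0f;
  Index j = 0;
  for (; j + kPacketSize <= n; j += kPacketSize)  // bound recomputed per iteration
    for (Index k = 0; k < kPacketSize; ++k)
      acc += x[j + k] * y[j + k];
  for (; j < n; ++j)                              // scalar tail continues from j
    acc += x[j] * y[j];
  return acc;
}

// New shape: the block end is hoisted as an exact multiple of the packet
// size, so each condition is a plain j < end comparison.
float dot_new(const float* x, const float* y, Index n) {
  float acc = 0.0f;
  const Index fullBlockEnd = kPacketSize * (n / kPacketSize);
  for (Index j = 0; j < fullBlockEnd; j += kPacketSize)
    for (Index k = 0; k < kPacketSize; ++k)
      acc += x[j + k] * y[j + k];
  for (Index j = fullBlockEnd; j < n; ++j)        // tail starts at the block end
    acc += x[j] * y[j];
  return acc;
}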

@@ -361,6 +361,10 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
     HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
   };
+  const Index fullColBlockEnd = LhsPacketSize * (cols / LhsPacketSize);
+  const Index halfColBlockEnd = LhsPacketSizeHalf * (cols / LhsPacketSizeHalf);
+  const Index quarterColBlockEnd = LhsPacketSizeQuarter * (cols / LhsPacketSizeQuarter);
+
   Index i=0;
   for(; i<n8; i+=8)
   {
@@ -373,8 +377,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
               c6 = pset1<ResPacket>(ResScalar(0)),
               c7 = pset1<ResPacket>(ResScalar(0));
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
     {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
@@ -395,7 +398,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
     ResScalar cc5 = predux(c5);
     ResScalar cc6 = predux(c6);
     ResScalar cc7 = predux(c7);
-    for(; j<cols; ++j)
+    for (Index j = fullColBlockEnd; j < cols; ++j)
     {
       RhsScalar b0 = rhs(j,0);
@@ -424,8 +428,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0));
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
     {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
@@ -438,7 +441,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
     ResScalar cc1 = predux(c1);
     ResScalar cc2 = predux(c2);
     ResScalar cc3 = predux(c3);
-    for(; j<cols; ++j)
+    for(Index j = fullColBlockEnd; j < cols; ++j)
     {
       RhsScalar b0 = rhs(j,0);
@@ -457,8 +461,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0));
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
     {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
@@ -467,7 +470,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
     }
     ResScalar cc0 = predux(c0);
     ResScalar cc1 = predux(c1);
-    for(; j<cols; ++j)
+    for(Index j = fullColBlockEnd; j < cols; ++j)
     {
       RhsScalar b0 = rhs(j,0);
@@ -482,15 +486,15 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
     ResPacket c0 = pset1<ResPacket>(ResScalar(0));
     ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
     ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
     {
       RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
       c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
     }
     ResScalar cc0 = predux(c0);
     if (HasHalf) {
-      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
+      for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf)
       {
         RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
         c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
@@ -498,14 +502,14 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
       cc0 += predux(c0_h);
     }
     if (HasQuarter) {
-      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
+      for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter)
       {
         RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
         c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
       }
       cc0 += predux(c0_q);
     }
-    for(; j<cols; ++j)
+    for (Index j = quarterColBlockEnd; j < cols; ++j)
     {
       cc0 += cj.pmul(lhs(i,j), rhs(j,0));
     }
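Note the precomputed bounds chain exactly as the old shared index did: the full-packet loop covers [0, fullColBlockEnd), the half-packet loop picks up at fullColBlockEnd, the quarter-packet loop at halfColBlockEnd, and the scalar tail at quarterColBlockEnd. Each block end is a multiple of the next-smaller packet size (the packet sizes are powers of two), so every loop steps from its start exactly to its end bound. As a worked example with hypothetical widths 8/4/2 and cols = 23: fullColBlockEnd = 8*(23/8) = 16, halfColBlockEnd = 4*(23/4) = 20, quarterColBlockEnd = 2*(23/2) = 22, so the loops cover [0,16), [16,20), [20,22), and the scalar tail handles the final column.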