Quick temporary fix for a perf issue we just identified with vectorization.

Now the sum benchmark runs 3x faster with vectorization than without.
This commit is contained in:
Benoit Jacob 2008-06-23 11:23:05 +00:00
parent 32596c5e9e
commit 03d19f3bae

View File

@ -194,9 +194,22 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
// do the vectorizable part of the sum // do the vectorizable part of the sum
if(size >= packetSize) if(size >= packetSize)
{ {
asm("#begin");
PacketScalar packet_res; PacketScalar packet_res;
packet_res = mat.template packet<Aligned>(0, 0); packet_res = mat.template packet<Aligned>(0, 0);
int index; int index;
if(Derived::IsVectorAtCompileTime)
{
for(index = packetSize; index<alignedSize ; index+=packetSize)
{
const int row = Derived::RowsAtCompileTime==1 ? 0 : index;
const int col = Derived::RowsAtCompileTime==1 ? index : 0;
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
}
}
else
{
for(index = packetSize; index<alignedSize ; index+=packetSize) for(index = packetSize; index<alignedSize ; index+=packetSize)
{ {
// FIXME the following is not really efficient // FIXME the following is not really efficient
@ -204,7 +217,9 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
const int col = rowMajor ? index%innerSize : index/innerSize; const int col = rowMajor ? index%innerSize : index/innerSize;
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col)); packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
} }
}
res = ei_predux(packet_res); res = ei_predux(packet_res);
asm("#end");
// now we must do the rest without vectorization. // now we must do the rest without vectorization.
if(alignedSize == size) return res; if(alignedSize == size) return res;