optimize gemv for complex<double> and fix gcc alignment issue in 32bits

Author: Gael Guennebaud
Date:   2010-07-07 15:28:41 +02:00
parent e07c0f6bb5
commit 845994f18f
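
Note on the first pair of hunks below: the alignment pre-check now tests the pointers modulo sizeof(Scalar) instead of sizeof(RealScalar). For std::complex<double>, sizeof(Scalar) is 16 while sizeof(RealScalar) is 8, and stepping through the array advances by whole scalars, so a pointer that is off modulo sizeof(Scalar) can never reach 16-byte packet alignment by skipping elements. A minimal standalone sketch of the two tests (the address is hypothetical, not from the commit):

    #include <complex>
    #include <cstdio>

    int main()
    {
      typedef std::complex<double> Scalar; // sizeof(Scalar) == 16
      typedef double RealScalar;           // sizeof(RealScalar) == 8

      // Hypothetical address: 8-byte aligned but not 16-byte aligned, as a
      // 32-bit malloc may legitimately return.
      unsigned long addr = 0x1008;

      // Old test: 0x1008 % 8 == 0, so the pointer was not flagged as
      // unalignable and the kernel searched for an aligned starting element.
      bool oldFlagsUnalignable = (addr % sizeof(RealScalar)) != 0;

      // But addr + k*sizeof(Scalar) stays at 8 (mod 16) for every k: no
      // element is ever 16-byte aligned, so an aligned packet access would
      // fault. New test: 0x1008 % 16 != 0, so the scalar fallback
      // (alignedSize = 0, alignedStart = 0) is taken instead.
      bool newFlagsUnalignable = (addr % sizeof(Scalar)) != 0;

      std::printf("old flags unalignable: %d, new flags unalignable: %d\n",
                  (int)oldFlagsUnalignable, (int)newFlagsUnalignable);
      return 0;
    }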


@@ -88,7 +88,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
   // find how many columns do we have to skip to be aligned with the result (if possible)
   Index skipColumns = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(res)%sizeof(RealScalar)) )
+  if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
@@ -117,6 +117,12 @@ void ei_cache_friendly_product_colmajor_times_vector(
                       || PacketSize > size
                       || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
   }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = size;
+    alignmentPattern = AllAligned;
+  }
 
   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
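
The added else if(Vectorizable) branch is the gemv optimization for complex<double> from the commit message: with SSE, a packet (Packet1cd) holds exactly one coefficient, so PacketSize is 1 and the packet-skipping logic above does not apply; once the sizeof(Scalar) pre-check has passed, every coefficient is already packet aligned and the whole range can run through the vectorized AllAligned path instead of falling back to the scalar loop. A greatly simplified sketch of the resulting selection logic (hypothetical helper, not Eigen code):

    enum AlignmentPattern { NoneAligned, FirstAligned, AllAligned };

    // Hypothetical helper condensing the branch structure around this hunk.
    template<typename Scalar>
    AlignmentPattern choose_pattern(const Scalar* data, int packetSize, bool vectorizable)
    {
      if (reinterpret_cast<unsigned long>(data) % sizeof(Scalar))
        return NoneAligned;  // pre-check: packet alignment is unreachable
      if (packetSize > 1)
        return FirstAligned; // skip a few leading coefficients, then vectorize
      if (vectorizable)
        return AllAligned;   // new branch: one coefficient per packet (complex<double>)
      return NoneAligned;    // not vectorizable at all
    }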
@@ -305,7 +311,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(rhs)%sizeof(RealScalar)) )
+  if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
@@ -334,6 +340,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
                       || PacketSize > rhsSize
                       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = size;
+    alignmentPattern = AllAligned;
+  }
 
   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
@@ -341,7 +353,8 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
+    EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
+    Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
 
     // this helps the compiler generating good binary code
     const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
@@ -442,7 +455,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   {
     for (Index i=start; i<end; ++i)
     {
-      Scalar tmp0 = Scalar(0);
+      EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
       Packet ptmp0 = ei_pset1(tmp0);
       const Scalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
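
The EIGEN_ALIGN16 annotations in the last two hunks are the 32-bit gcc fix. EIGEN_ALIGN16 expands to the compiler's 16-byte alignment attribute (__attribute__((aligned(16))) on gcc), and 32-bit gcc does not otherwise guarantee 16-byte stack alignment, so the tmp0 accumulator of a complex<double> kernel could land on an 8-byte boundary; ei_pset1 for Packet1cd reads the coefficient from its address with a 16-byte load, which would then fault. A minimal sketch of the failure mode (hypothetical function, assuming SSE2; compile with -msse2):

    #include <complex>
    #include <emmintrin.h>

    // Hypothetical stand-in for the kernel's accumulator setup.
    double reduce_tail()
    {
      // EIGEN_ALIGN16 expands to this attribute under gcc; without it,
      // 32-bit gcc only guarantees 8-byte stack alignment and the 16-byte
      // load below can fault on a misaligned tmp0.
      __attribute__((aligned(16))) std::complex<double> tmp0(0.0, 0.0);

      // Mirrors ei_pset1 for Packet1cd: a 16-byte aligned load straight
      // from the scalar's address.
      __m128d ptmp0 = _mm_load_pd(reinterpret_cast<const double*>(&tmp0));

      // ... vectorized accumulation into ptmp0 would happen here ...

      double out[2];
      _mm_storeu_pd(out, ptmp0);
      return out[0] + out[1]; // real + imaginary parts of the accumulator
    }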