fix 4x4 SSE inversion when storage orders don't match

Gael Guennebaud 2010-08-24 13:00:59 +02:00
parent 548ecc2fe5
commit a47bbf664c

@@ -46,8 +46,9 @@ template<typename MatrixType, typename ResultType>
 struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
 {
   enum {
     MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment = bool(ResultType::Flags&AlignedBit)
+    ResultAlignment = bool(ResultType::Flags&AlignedBit),
+    StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
   };
 
   static void run(const MatrixType& matrix, ResultType& result)
@@ -66,10 +67,21 @@ struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType
     // represented as a registers. Hence we get a better locality of the
     // calculations.
 
-    __m128 A = _mm_movelh_ps(_L1, _L2), // the four sub-matrices
-           B = _mm_movehl_ps(_L2, _L1),
-           C = _mm_movelh_ps(_L3, _L4),
-           D = _mm_movehl_ps(_L4, _L3);
+    __m128 A, B, C, D; // the four sub-matrices
+    if(!StorageOrdersMatch)
+    {
+      A = _mm_unpacklo_ps(_L1, _L2);
+      B = _mm_unpacklo_ps(_L3, _L4);
+      C = _mm_unpackhi_ps(_L1, _L2);
+      D = _mm_unpackhi_ps(_L3, _L4);
+    }
+    else
+    {
+      A = _mm_movelh_ps(_L1, _L2);
+      B = _mm_movehl_ps(_L2, _L1);
+      C = _mm_movelh_ps(_L3, _L4);
+      D = _mm_movehl_ps(_L4, _L3);
+    }
 
     __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices
            DC, AB;
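
Note (not part of the commit): the swap of movelh/movehl for unpacklo/unpackhi in the mismatched case is easiest to see on a concrete layout. The sketch below is a minimal standalone check; the arrays rm and cm and the use of plain unaligned loads in place of Eigen's packet<MatrixAlignment> are assumptions made purely for illustration. It verifies that interleaving two columns with _mm_unpacklo_ps yields the same 2x2 sub-blocks that _mm_movelh_ps/_mm_movehl_ps extract from two rows.

#include <xmmintrin.h>
#include <cstdio>
#include <cstring>

int main()
{
  // rm holds a 4x4 matrix row by row, cm holds the same matrix column by
  // column, i.e. cm is what the packet loads see when the storage orders
  // of the source and the destination disagree.
  float rm[16], cm[16];
  for(int r = 0; r < 4; ++r)
    for(int c = 0; c < 4; ++c)
    {
      rm[4*r + c] = float(10*r + c);
      cm[4*c + r] = float(10*r + c);
    }

  // Matching orders: packets are rows, movelh/movehl slice out 2x2 blocks.
  __m128 r0 = _mm_loadu_ps(rm + 0), r1 = _mm_loadu_ps(rm + 4);
  __m128 A_rows = _mm_movelh_ps(r0, r1); // [m00 m01 m10 m11]
  __m128 B_rows = _mm_movehl_ps(r1, r0); // [m02 m03 m12 m13]

  // Mismatching orders: packets are columns, unpacklo interleaves two
  // columns and produces exactly the same blocks.
  __m128 c0 = _mm_loadu_ps(cm + 0), c1 = _mm_loadu_ps(cm + 4),
         c2 = _mm_loadu_ps(cm + 8), c3 = _mm_loadu_ps(cm + 12);
  __m128 A_cols = _mm_unpacklo_ps(c0, c1);
  __m128 B_cols = _mm_unpacklo_ps(c2, c3);

  float a1[4], a2[4], b1[4], b2[4];
  _mm_storeu_ps(a1, A_rows); _mm_storeu_ps(a2, A_cols);
  _mm_storeu_ps(b1, B_rows); _mm_storeu_ps(b2, B_cols);
  std::printf("A equal: %d, B equal: %d\n",
              std::memcmp(a1, a2, sizeof a1) == 0,
              std::memcmp(b1, b2, sizeof b1) == 0);
  return 0;
}

C and D follow the same pattern with _mm_unpackhi_ps, which picks the upper halves (rows 2 and 3) of the two interleaved columns.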
@@ -163,7 +175,8 @@ struct ei_compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultTyp
 {
   enum {
     MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment = bool(ResultType::Flags&AlignedBit)
+    ResultAlignment = bool(ResultType::Flags&AlignedBit),
+    StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
   };
   static void run(const MatrixType& matrix, ResultType& result)
   {
@@ -177,10 +190,37 @@ struct ei_compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultTyp
     // calculations.
 
     // the four sub-matrices
-    __m128d A1(matrix.template packet<MatrixAlignment>( 0)), B1(matrix.template packet<MatrixAlignment>( 2)),
-            A2(matrix.template packet<MatrixAlignment>( 4)), B2(matrix.template packet<MatrixAlignment>( 6)),
-            C1(matrix.template packet<MatrixAlignment>( 8)), D1(matrix.template packet<MatrixAlignment>(10)),
-            C2(matrix.template packet<MatrixAlignment>(12)), D2(matrix.template packet<MatrixAlignment>(14));
+    __m128d A1, A2, B1, B2, C1, C2, D1, D2;
+
+    if(StorageOrdersMatch)
+    {
+      A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
+      A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
+      C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
+      C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
+    }
+    else
+    {
+      __m128d tmp;
+      A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
+      A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
+      tmp = A1;
+      A1 = _mm_unpacklo_pd(A1,A2);
+      A2 = _mm_unpackhi_pd(tmp,A2);
+      tmp = C1;
+      C1 = _mm_unpacklo_pd(C1,C2);
+      C2 = _mm_unpackhi_pd(tmp,C2);
+
+      B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
+      B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
+      tmp = B1;
+      B1 = _mm_unpacklo_pd(B1,B2);
+      B2 = _mm_unpackhi_pd(tmp,B2);
+      tmp = D1;
+      D1 = _mm_unpacklo_pd(D1,D2);
+      D2 = _mm_unpackhi_pd(tmp,D2);
+    }
 
     __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices
             DC1, DC2, AB1, AB2;
     __m128d dA, dB, dC, dD; // determinant of the sub-matrices
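
Note (not part of the commit): in the double-precision path a __m128d holds only two entries, so each 2x2 sub-block already spans two registers, one per row when the storage orders match and one per column when they do not. The else branch above therefore transposes each block in registers with _mm_unpacklo_pd/_mm_unpackhi_pd, keeping a tmp copy only because the first operand is overwritten before its high half is consumed. A minimal standalone sketch of that 2x2 in-register transpose, with a made-up input array standing in for Eigen's packet loads:

#include <emmintrin.h>
#include <cstdio>

int main()
{
  // One 2x2 block stored column by column: col0 = (1,3), col1 = (2,4),
  // i.e. the matrix [1 2; 3 4] as it appears after loading two packets
  // from a source with the opposite storage order.
  const double cols[4] = { 1.0, 3.0, 2.0, 4.0 };

  __m128d c0 = _mm_loadu_pd(cols + 0); // [1, 3] : first column
  __m128d c1 = _mm_loadu_pd(cols + 2); // [2, 4] : second column

  // unpacklo keeps the low halves, unpackhi the high halves: the two
  // columns become the two rows, which is the transpose applied to each
  // of A, B, C and D above.
  __m128d row0 = _mm_unpacklo_pd(c0, c1); // [1, 2]
  __m128d row1 = _mm_unpackhi_pd(c0, c1); // [3, 4]

  double r0[2], r1[2];
  _mm_storeu_pd(r0, row0);
  _mm_storeu_pd(r1, row1);
  std::printf("row0 = [%g %g]  row1 = [%g %g]\n", r0[0], r0[1], r1[0], r1[1]);
  return 0;
}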