diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index 7421b7012..6d497d326 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -46,8 +46,9 @@ template<typename MatrixType, typename ResultType> struct ei_compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType> { enum { - MatrixAlignment = bool(MatrixType::Flags&AlignedBit), - ResultAlignment = bool(ResultType::Flags&AlignedBit) + MatrixAlignment = bool(MatrixType::Flags&AlignedBit), + ResultAlignment = bool(ResultType::Flags&AlignedBit), + StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) }; static void run(const MatrixType& matrix, ResultType& result) @@ -66,10 +67,21 @@ struct ei_compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType> - __m128d A1(matrix.template packet<MatrixAlignment>( 0)), B1(matrix.template packet<MatrixAlignment>( 2)), - A2(matrix.template packet<MatrixAlignment>( 4)), B2(matrix.template packet<MatrixAlignment>( 6)), - C1(matrix.template packet<MatrixAlignment>( 8)), D1(matrix.template packet<MatrixAlignment>(10)), - C2(matrix.template packet<MatrixAlignment>(12)), D2(matrix.template packet<MatrixAlignment>(14)); + __m128d A1, A2, B1, B2, C1, C2, D1, D2; + + if(StorageOrdersMatch) + { + A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2); + A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6); + C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); + C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); + } + else + { + __m128d tmp; + A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2); + A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6); + tmp = A1; + A1 = _mm_unpacklo_pd(A1,A2); + A2 = _mm_unpackhi_pd(tmp,A2); + tmp = C1; + C1 = _mm_unpacklo_pd(C1,C2); + C2 = _mm_unpackhi_pd(tmp,C2); + + B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); + B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); + tmp = B1; + B1 = _mm_unpacklo_pd(B1,B2); + B2 = _mm_unpackhi_pd(tmp,B2); + tmp = D1; + D1 = _mm_unpacklo_pd(D1,D2); + D2 = _mm_unpackhi_pd(tmp,D2); + } + __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices DC1, DC2, AB1, AB2; __m128d dA, dB,
dC, dD; // determinant of the sub-matrices