diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index 60b7a2376..86980b4b0 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -151,10 +151,12 @@ struct compute_inverse_size4 iC = _mm_mul_ps(rd,iC); iD = _mm_mul_ps(rd,iD); - result.template writePacket( 0, _mm_shuffle_ps(iA,iB,0x77)); - result.template writePacket( 4, _mm_shuffle_ps(iA,iB,0x22)); - result.template writePacket( 8, _mm_shuffle_ps(iC,iD,0x77)); - result.template writePacket(12, _mm_shuffle_ps(iC,iD,0x22)); + Index res_stride = result.outerStride(); + float* res = result.data(); + pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); + pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); + pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); + pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); } }; @@ -311,14 +313,16 @@ struct compute_inverse_size4 iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - result.template writePacket( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); // iA# / det - result.template writePacket( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - result.template writePacket( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); // iB# / det - result.template writePacket( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - result.template writePacket( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); // iC# / det - result.template writePacket(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - result.template writePacket(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); // iD# / det - result.template writePacket(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); + Index res_stride = result.outerStride(); + double* res = result.data(); + pstoret(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); + pstoret(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); + pstoret(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); + pstoret(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); + pstoret(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); + pstoret(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); + pstoret(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); + pstoret(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); } }; diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp index c4ef2d4bd..eb6ad18c9 100644 --- a/test/prec_inverse_4x4.cpp +++ b/test/prec_inverse_4x4.cpp @@ -53,14 +53,29 @@ template void inverse_general_4x4(int repeat) // FIXME that 1.25 used to be 1.2 until we tested gcc 4.1 on 30 June 2010 and got 1.21. VERIFY(error_avg < (NumTraits::IsComplex ? 8.0 : 1.25)); VERIFY(error_max < (NumTraits::IsComplex ? 64.0 : 20.0)); + + { + int s = 5;//internal::random(4,10); + int i = 0;//internal::random(0,s-4); + int j = 0;//internal::random(0,s-4); + Matrix mat(s,s); + mat.setRandom(); + MatrixType submat = mat.template block<4,4>(i,j); + MatrixType mat_inv = mat.template block<4,4>(i,j).inverse(); + VERIFY_IS_APPROX(mat_inv, submat.inverse()); + mat.template block<4,4>(i,j) = submat.inverse(); + VERIFY_IS_APPROX(mat_inv, (mat.template block<4,4>(i,j))); + } } void test_prec_inverse_4x4() { CALL_SUBTEST_1((inverse_permutation_4x4())); CALL_SUBTEST_1(( inverse_general_4x4(200000 * g_repeat) )); + CALL_SUBTEST_1(( inverse_general_4x4 >(200000 * g_repeat) )); CALL_SUBTEST_2((inverse_permutation_4x4 >())); + CALL_SUBTEST_2(( inverse_general_4x4 >(200000 * g_repeat) )); CALL_SUBTEST_2(( inverse_general_4x4 >(200000 * g_repeat) )); CALL_SUBTEST_3((inverse_permutation_4x4()));