From 4262117f84e5a2bc117f0553870fbf6e9cfdf6b5 Mon Sep 17 00:00:00 2001
From: Benoit Jacob
Date: Tue, 15 Dec 2009 08:16:48 -0500
Subject: [PATCH] backport 4x4 inverse changes:
 - use cofactors
 - use Intel's SSE code in the float case

---
 Eigen/src/LU/Inverse.h    | 210 ++++++++++++++++++++++++--------------
 test/prec_inverse_4x4.cpp |  10 +-
 2 files changed, 135 insertions(+), 85 deletions(-)

diff --git a/Eigen/src/LU/Inverse.h b/Eigen/src/LU/Inverse.h
index 448980ff0..3728cbdc5 100644
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -75,93 +75,147 @@ void ei_compute_inverse_in_size3_case(const MatrixType& matrix, MatrixType* result)
   result->coeffRef(2, 2) = matrix.minor(2,2).determinant() * invdet;
 }

-template<typename MatrixType>
-bool ei_compute_inverse_in_size4_case_helper(const MatrixType& matrix, MatrixType* result)
+template<typename MatrixType, typename Scalar = typename MatrixType::Scalar>
+struct ei_compute_inverse_in_size4_case
 {
-  /* Let's split M into four 2x2 blocks:
-    * (P Q)
-    * (R S)
-    * If P is invertible, with inverse denoted by P_inverse, and if
-    * (S - R*P_inverse*Q) is also invertible, then the inverse of M is
-    * (P' Q')
-    * (R' S')
-    * where
-    * S' = (S - R*P_inverse*Q)^(-1)
-    * P' = P1 + (P1*Q) * S' *(R*P_inverse)
-    * Q' = -(P_inverse*Q) * S'
-    * R' = -S' * (R*P_inverse)
-    */
-  typedef Block<MatrixType,2,2> XprBlock22;
-  typedef typename MatrixBase<XprBlock22>::PlainMatrixType Block22;
-  Block22 P_inverse;
-  ei_compute_inverse_in_size2_case(matrix.template block<2,2>(0,0), &P_inverse);
-  const Block22 Q = matrix.template block<2,2>(0,2);
-  const Block22 P_inverse_times_Q = P_inverse * Q;
-  const XprBlock22 R = matrix.template block<2,2>(2,0);
-  const Block22 R_times_P_inverse = R * P_inverse;
-  const Block22 R_times_P_inverse_times_Q = R_times_P_inverse * Q;
-  const XprBlock22 S = matrix.template block<2,2>(2,2);
-  const Block22 X = S - R_times_P_inverse_times_Q;
-  Block22 Y;
-  ei_compute_inverse_in_size2_case(X, &Y);
-  result->template block<2,2>(2,2) = Y;
-  result->template block<2,2>(2,0) = - Y * R_times_P_inverse;
-  const Block22 Z = P_inverse_times_Q * Y;
-  result->template block<2,2>(0,2) = - Z;
-  result->template block<2,2>(0,0) = P_inverse + Z * R_times_P_inverse;
-  return true;
-}
+  static void run(const MatrixType& matrix, MatrixType& result)
+  {
+    result.coeffRef(0,0) = matrix.minor(0,0).determinant();
+    result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
+    result.coeffRef(2,0) = matrix.minor(0,2).determinant();
+    result.coeffRef(3,0) = -matrix.minor(0,3).determinant();
+    result.coeffRef(0,2) = matrix.minor(2,0).determinant();
+    result.coeffRef(1,2) = -matrix.minor(2,1).determinant();
+    result.coeffRef(2,2) = matrix.minor(2,2).determinant();
+    result.coeffRef(3,2) = -matrix.minor(2,3).determinant();
+    result.coeffRef(0,1) = -matrix.minor(1,0).determinant();
+    result.coeffRef(1,1) = matrix.minor(1,1).determinant();
+    result.coeffRef(2,1) = -matrix.minor(1,2).determinant();
+    result.coeffRef(3,1) = matrix.minor(1,3).determinant();
+    result.coeffRef(0,3) = -matrix.minor(3,0).determinant();
+    result.coeffRef(1,3) = matrix.minor(3,1).determinant();
+    result.coeffRef(2,3) = -matrix.minor(3,2).determinant();
+    result.coeffRef(3,3) = matrix.minor(3,3).determinant();
+    result /= (matrix.col(0).cwise()*result.row(0).transpose()).sum();
+  }
+};

+#ifdef EIGEN_VECTORIZE_SSE
 template<typename MatrixType>
-void ei_compute_inverse_in_size4_case(const MatrixType& _matrix, MatrixType* result)
+struct ei_compute_inverse_in_size4_case<MatrixType, float>
 {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
+  static void run(const MatrixType& matrix, MatrixType& result)
+  {
+    // Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the
+    // lines of the inverted matrix.
+    __m128 minor0, minor1, minor2, minor3;

-  // we will do row permutations on the matrix. This copy should have negligible cost.
-  // if not, consider working in-place on the matrix (const-cast it, but then undo the permutations
-  // to nevertheless honor constness)
-  typename MatrixType::PlainMatrixType matrix(_matrix);
+    // Variables which will contain the lines of the reference matrix and, later (after the transposition),
+    // the columns of the original matrix.
+    __m128 row0, row1, row2, row3;

-  // let's extract from the 2 first colums a 2x2 block whose determinant is as big as possible.
-  int good_row0, good_row1, good_i;
-  Matrix<RealScalar,6,1> absdet;
+    // Temporary variables and the variable that will contain the matrix determinant.
+    __m128 det, tmp1;

-  // any 2x2 block with determinant above this threshold will be considered good enough.
-  // The magic value 1e-1 here comes from experimentation. The bigger it is, the higher the precision,
-  // the slower the computation. This value 1e-1 gives precision almost as good as the brutal cofactors
-  // algorithm, both in average and in worst-case precision.
-  RealScalar d = (matrix.col(0).squaredNorm()+matrix.col(1).squaredNorm()) * RealScalar(1e-1);
-  #define ei_inv_size4_helper_macro(i,row0,row1) \
-  absdet[i] = ei_abs(matrix.coeff(row0,0)*matrix.coeff(row1,1) \
-            - matrix.coeff(row0,1)*matrix.coeff(row1,0)); \
-  if(absdet[i] > d) { good_row0=row0; good_row1=row1; goto good; }
-  ei_inv_size4_helper_macro(0,0,1)
-  ei_inv_size4_helper_macro(1,0,2)
-  ei_inv_size4_helper_macro(2,0,3)
-  ei_inv_size4_helper_macro(3,1,2)
-  ei_inv_size4_helper_macro(4,1,3)
-  ei_inv_size4_helper_macro(5,2,3)
+    // Matrix transposition
+    const float *src = matrix.data();
+    tmp1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)src)), (__m64*)(src+ 4));
+    row1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+8))), (__m64*)(src+12));
+    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
+    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
+    tmp1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+ 2))), (__m64*)(src+ 6));
+    row3 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+10))), (__m64*)(src+14));
+    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
+    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

-  // no 2x2 block has determinant bigger than the threshold. So just take the one that
-  // has the biggest determinant
-  absdet.maxCoeff(&good_i);
-  good_row0 = good_i <= 2 ? 0 : good_i <= 4 ? 1 : 2;
-  good_row1 = good_i <= 2 ? good_i+1 : good_i <= 4 ? good_i-1 : 3;
-  // now good_row0 and good_row1 are correctly set
-  good:
+    // Cofactors calculation. Because in the process of cofactor computation some pairs in three-
+    // element products are repeated, it is not reasonable to load these pairs anew every time. The
+    // values in the registers with these pairs are formed using a shuffle instruction. Cofactors are
+    // calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register).
-  // do row permutations to move this 2x2 block to the top
-  matrix.row(0).swap(matrix.row(good_row0));
-  matrix.row(1).swap(matrix.row(good_row1));
-  // now applying our helper function is numerically stable
-  ei_compute_inverse_in_size4_case_helper(matrix, result);
-  // Since we did row permutations on the original matrix, we need to do column permutations
-  // in the reverse order on the inverse
-  result->col(1).swap(result->col(good_row1));
-  result->col(0).swap(result->col(good_row0));
-}
+    tmp1 = _mm_mul_ps(row2, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_mul_ps(row1, tmp1);
+    minor1 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row1, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+    minor3 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
+    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+    minor2 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+
+    // Evaluation of the determinant and its reciprocal value. In the original Intel document,
+    // 1/det was evaluated using a fast rcpps command with subsequent approximation using
+    // the Newton-Raphson algorithm. Here, we go for an IEEE-compliant division instead,
+    // so as to not compromise precision at all.
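+    // (One Newton-Raphson step x1 = x0*(2 - det*x0), seeded with the roughly 12-bit-accurate
+    // approximation x0 = rcpss(det), about doubles the number of correct bits; that is what
+    // the two commented-out lines below compute. divss, in contrast, is correctly rounded
+    // to 0.5 ulp.)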
+    det = _mm_mul_ps(row0, minor0);
+    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+//  tmp1 = _mm_rcp_ss(det);
+//  det  = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
+    det = _mm_div_ss(_mm_set_ss(1.0f), det); // <--- yay, one original line not copied from Intel
+    det = _mm_shuffle_ps(det, det, 0x00);
+    // warning, Intel's variable naming is very confusing: now 'det' is 1/det!
+
+    // Multiplication of cofactors by 1/det. Storing the inverse matrix to the address in pointer dst.
+    minor0 = _mm_mul_ps(det, minor0);
+    float *dst = result.data();
+    _mm_storel_pi((__m64*)(dst), minor0);
+    _mm_storeh_pi((__m64*)(dst+2), minor0);
+    minor1 = _mm_mul_ps(det, minor1);
+    _mm_storel_pi((__m64*)(dst+4), minor1);
+    _mm_storeh_pi((__m64*)(dst+6), minor1);
+    minor2 = _mm_mul_ps(det, minor2);
+    _mm_storel_pi((__m64*)(dst+ 8), minor2);
+    _mm_storeh_pi((__m64*)(dst+10), minor2);
+    minor3 = _mm_mul_ps(det, minor3);
+    _mm_storel_pi((__m64*)(dst+12), minor3);
+    _mm_storeh_pi((__m64*)(dst+14), minor3);
+  }
+};
+#endif

 /***********************************************
 *** Part 2 : selector and MatrixBase methods ***
@@ -210,7 +264,7 @@ struct ei_compute_inverse<MatrixType, 4>
 {
   static inline void run(const MatrixType& matrix, MatrixType* result)
   {
-    ei_compute_inverse_in_size4_case(matrix, result);
+    ei_compute_inverse_in_size4_case<MatrixType>::run(matrix, *result);
   }
 };

diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index 0494912a3..972f5c0ba 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -45,7 +45,6 @@ template<typename MatrixType> void inverse_permutation_4x4()
 {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  double error_max = 0.;
   Vector4i indices(0,1,2,3);
   for(int i = 0; i < 24; ++i)
   {
@@ -56,12 +55,9 @@ template<typename MatrixType> void inverse_permutation_4x4()
     m(indices(3),3) = 1;
     MatrixType inv = m.inverse();
     double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
-    error_max = std::max(error_max, error);
+    VERIFY(error == 0.0);
     std::next_permutation(indices.data(),indices.data()+4);
   }
-  std::cerr << "inverse_permutation_4x4, Scalar = " << type_name<Scalar>() << std::endl;
-  EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_max < 1. );
 }

 template<typename MatrixType> void inverse_general_4x4(int repeat)
@@ -86,8 +82,8 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
   double error_avg = error_sum / repeat;
   EIGEN_DEBUG_VAR(error_avg);
   EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.4 : 1.4) );
-  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 160.0 : 75.) );
+  VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
+  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
 }

 void test_prec_inverse_4x4()
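
---

A note on the removed block-wise algorithm: its comment writes "P1" where "P_inverse"
is meant. The identities it used are the standard Schur-complement block inverse: for
M = (P Q; R S) with P and (S - R*P_inverse*Q) both invertible,

    S' = (S - R*P_inverse*Q)^(-1)
    P' = P_inverse + (P_inverse*Q) * S' * (R*P_inverse)
    Q' = -(P_inverse*Q) * S'
    R' = -S' * (R*P_inverse)

and M^(-1) = (P' Q'; R' S'). The accuracy of that scheme depends on finding a
well-conditioned 2x2 block P, which is why the old code searched the first two
columns for one; the cofactors approach needs no such pivoting.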
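For reference, the generic run() above is the classical adjugate formula
inverse(M) = adj(M)/det(M): entry (j,i) of the result is the signed 3x3 minor
(cofactor) of entry (i,j), and the final `result /= ...` line recovers det(M) as the
dot product of M's first column with the adjugate's first row (Laplace expansion).
Here is a minimal scalar sketch of the same computation on plain arrays; the helper
names cofactor3 and inverse4x4 are illustrative, not Eigen API:

    // Signed cofactor of m(r,c): (-1)^(r+c) times the determinant of the 3x3
    // minor obtained by deleting row r and column c of the 4x4 matrix m.
    float cofactor3(const float m[4][4], int r, int c)
    {
        int ri[3], ci[3];
        for (int i = 0, k = 0; i < 4; ++i) if (i != r) ri[k++] = i;
        for (int j = 0, k = 0; j < 4; ++j) if (j != c) ci[k++] = j;
        float det3 =
              m[ri[0]][ci[0]] * (m[ri[1]][ci[1]]*m[ri[2]][ci[2]] - m[ri[1]][ci[2]]*m[ri[2]][ci[1]])
            - m[ri[0]][ci[1]] * (m[ri[1]][ci[0]]*m[ri[2]][ci[2]] - m[ri[1]][ci[2]]*m[ri[2]][ci[0]])
            + m[ri[0]][ci[2]] * (m[ri[1]][ci[0]]*m[ri[2]][ci[1]] - m[ri[1]][ci[1]]*m[ri[2]][ci[0]]);
        return (((r + c) & 1) ? -1.0f : 1.0f) * det3;
    }

    // inverse(M) = adj(M) / det(M), with det(M) recovered by Laplace expansion
    // along the first column, exactly like the generic run() above.
    void inverse4x4(const float m[4][4], float inv[4][4])
    {
        for (int i = 0; i < 4; ++i)
            for (int j = 0; j < 4; ++j)
                inv[j][i] = cofactor3(m, i, j);  // adjugate = transposed cofactor matrix
        float det = 0.0f;
        for (int i = 0; i < 4; ++i)
            det += m[i][0] * inv[0][i];          // first column of M dot first row of adj(M)
        for (int i = 0; i < 4; ++i)
            for (int j = 0; j < 4; ++j)
                inv[i][j] /= det;
    }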
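And a usage sketch through the public API, assuming a tree with this patch applied
(with SSE enabled, the float specialization is exercised). It checks m * m.inverse()
against the identity, which is the same residual that test/prec_inverse_4x4.cpp
measures in units of machine epsilon:

    #include <Eigen/Core>
    #include <Eigen/LU>
    #include <iostream>

    int main()
    {
        // A random 4x4 float matrix is invertible with probability 1.
        Eigen::Matrix4f m = Eigen::Matrix4f::Random();
        Eigen::Matrix4f inv = m.inverse();  // dispatches to ei_compute_inverse_in_size4_case
        std::cout << "residual: " << (m * inv - Eigen::Matrix4f::Identity()).norm() << "\n";
        return 0;
    }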