mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-19 19:34:29 +08:00
backport 4x4 inverse changes:
- use cofactors - use Intel's SSE code in the float case
This commit is contained in:
parent
b581cb870c
commit
4262117f84
@ -75,93 +75,147 @@ void ei_compute_inverse_in_size3_case(const MatrixType& matrix, MatrixType* resu
|
||||
result->coeffRef(2, 2) = matrix.minor(2,2).determinant() * invdet;
|
||||
}
|
||||
|
||||
template<typename MatrixType>
|
||||
bool ei_compute_inverse_in_size4_case_helper(const MatrixType& matrix, MatrixType* result)
|
||||
template<typename MatrixType, typename Scalar = typename MatrixType::Scalar>
|
||||
struct ei_compute_inverse_in_size4_case
|
||||
{
|
||||
/* Let's split M into four 2x2 blocks:
|
||||
* (P Q)
|
||||
* (R S)
|
||||
* If P is invertible, with inverse denoted by P_inverse, and if
|
||||
* (S - R*P_inverse*Q) is also invertible, then the inverse of M is
|
||||
* (P' Q')
|
||||
* (R' S')
|
||||
* where
|
||||
* S' = (S - R*P_inverse*Q)^(-1)
|
||||
* P' = P1 + (P1*Q) * S' *(R*P_inverse)
|
||||
* Q' = -(P_inverse*Q) * S'
|
||||
* R' = -S' * (R*P_inverse)
|
||||
*/
|
||||
typedef Block<MatrixType,2,2> XprBlock22;
|
||||
typedef typename MatrixBase<XprBlock22>::PlainMatrixType Block22;
|
||||
Block22 P_inverse;
|
||||
ei_compute_inverse_in_size2_case(matrix.template block<2,2>(0,0), &P_inverse);
|
||||
const Block22 Q = matrix.template block<2,2>(0,2);
|
||||
const Block22 P_inverse_times_Q = P_inverse * Q;
|
||||
const XprBlock22 R = matrix.template block<2,2>(2,0);
|
||||
const Block22 R_times_P_inverse = R * P_inverse;
|
||||
const Block22 R_times_P_inverse_times_Q = R_times_P_inverse * Q;
|
||||
const XprBlock22 S = matrix.template block<2,2>(2,2);
|
||||
const Block22 X = S - R_times_P_inverse_times_Q;
|
||||
Block22 Y;
|
||||
ei_compute_inverse_in_size2_case(X, &Y);
|
||||
result->template block<2,2>(2,2) = Y;
|
||||
result->template block<2,2>(2,0) = - Y * R_times_P_inverse;
|
||||
const Block22 Z = P_inverse_times_Q * Y;
|
||||
result->template block<2,2>(0,2) = - Z;
|
||||
result->template block<2,2>(0,0) = P_inverse + Z * R_times_P_inverse;
|
||||
return true;
|
||||
}
|
||||
static void run(const MatrixType& matrix, MatrixType& result)
|
||||
{
|
||||
result.coeffRef(0,0) = matrix.minor(0,0).determinant();
|
||||
result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
|
||||
result.coeffRef(2,0) = matrix.minor(0,2).determinant();
|
||||
result.coeffRef(3,0) = -matrix.minor(0,3).determinant();
|
||||
result.coeffRef(0,2) = matrix.minor(2,0).determinant();
|
||||
result.coeffRef(1,2) = -matrix.minor(2,1).determinant();
|
||||
result.coeffRef(2,2) = matrix.minor(2,2).determinant();
|
||||
result.coeffRef(3,2) = -matrix.minor(2,3).determinant();
|
||||
result.coeffRef(0,1) = -matrix.minor(1,0).determinant();
|
||||
result.coeffRef(1,1) = matrix.minor(1,1).determinant();
|
||||
result.coeffRef(2,1) = -matrix.minor(1,2).determinant();
|
||||
result.coeffRef(3,1) = matrix.minor(1,3).determinant();
|
||||
result.coeffRef(0,3) = -matrix.minor(3,0).determinant();
|
||||
result.coeffRef(1,3) = matrix.minor(3,1).determinant();
|
||||
result.coeffRef(2,3) = -matrix.minor(3,2).determinant();
|
||||
result.coeffRef(3,3) = matrix.minor(3,3).determinant();
|
||||
result /= (matrix.col(0).cwise()*result.row(0).transpose()).sum();
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
template<typename MatrixType>
|
||||
void ei_compute_inverse_in_size4_case(const MatrixType& _matrix, MatrixType* result)
|
||||
struct ei_compute_inverse_in_size4_case<MatrixType, float>
|
||||
{
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
static void run(const MatrixType& matrix, MatrixType& result)
|
||||
{
|
||||
// Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the
|
||||
// lines of the inverted matrix.
|
||||
__m128 minor0, minor1, minor2, minor3;
|
||||
|
||||
// we will do row permutations on the matrix. This copy should have negligible cost.
|
||||
// if not, consider working in-place on the matrix (const-cast it, but then undo the permutations
|
||||
// to nevertheless honor constness)
|
||||
typename MatrixType::PlainMatrixType matrix(_matrix);
|
||||
// Variables which will contain the lines of the reference matrix and, later (after the transposition),
|
||||
// the columns of the original matrix.
|
||||
__m128 row0, row1, row2, row3;
|
||||
|
||||
// let's extract from the 2 first colums a 2x2 block whose determinant is as big as possible.
|
||||
int good_row0, good_row1, good_i;
|
||||
Matrix<RealScalar,6,1> absdet;
|
||||
// Temporary variables and the variable that will contain the matrix determinant.
|
||||
__m128 det, tmp1;
|
||||
|
||||
// any 2x2 block with determinant above this threshold will be considered good enough.
|
||||
// The magic value 1e-1 here comes from experimentation. The bigger it is, the higher the precision,
|
||||
// the slower the computation. This value 1e-1 gives precision almost as good as the brutal cofactors
|
||||
// algorithm, both in average and in worst-case precision.
|
||||
RealScalar d = (matrix.col(0).squaredNorm()+matrix.col(1).squaredNorm()) * RealScalar(1e-1);
|
||||
#define ei_inv_size4_helper_macro(i,row0,row1) \
|
||||
absdet[i] = ei_abs(matrix.coeff(row0,0)*matrix.coeff(row1,1) \
|
||||
- matrix.coeff(row0,1)*matrix.coeff(row1,0)); \
|
||||
if(absdet[i] > d) { good_row0=row0; good_row1=row1; goto good; }
|
||||
ei_inv_size4_helper_macro(0,0,1)
|
||||
ei_inv_size4_helper_macro(1,0,2)
|
||||
ei_inv_size4_helper_macro(2,0,3)
|
||||
ei_inv_size4_helper_macro(3,1,2)
|
||||
ei_inv_size4_helper_macro(4,1,3)
|
||||
ei_inv_size4_helper_macro(5,2,3)
|
||||
// Matrix transposition
|
||||
const float *src = matrix.data();
|
||||
tmp1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)src)), (__m64*)(src+ 4));
|
||||
row1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+8))), (__m64*)(src+12));
|
||||
row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
|
||||
row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
|
||||
tmp1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+ 2))), (__m64*)(src+ 6));
|
||||
row3 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+10))), (__m64*)(src+14));
|
||||
row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
|
||||
row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
|
||||
|
||||
// no 2x2 block has determinant bigger than the threshold. So just take the one that
|
||||
// has the biggest determinant
|
||||
absdet.maxCoeff(&good_i);
|
||||
good_row0 = good_i <= 2 ? 0 : good_i <= 4 ? 1 : 2;
|
||||
good_row1 = good_i <= 2 ? good_i+1 : good_i <= 4 ? good_i-1 : 3;
|
||||
|
||||
// now good_row0 and good_row1 are correctly set
|
||||
good:
|
||||
// Cofactors calculation. Because in the process of cofactor computation some pairs in three-
|
||||
// element products are repeated, it is not reasonable to load these pairs anew every time. The
|
||||
// values in the registers with these pairs are formed using shuffle instruction. Cofactors are
|
||||
// calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register).
|
||||
|
||||
// do row permutations to move this 2x2 block to the top
|
||||
matrix.row(0).swap(matrix.row(good_row0));
|
||||
matrix.row(1).swap(matrix.row(good_row1));
|
||||
// now applying our helper function is numerically stable
|
||||
ei_compute_inverse_in_size4_case_helper(matrix, result);
|
||||
// Since we did row permutations on the original matrix, we need to do column permutations
|
||||
// in the reverse order on the inverse
|
||||
result->col(1).swap(result->col(good_row1));
|
||||
result->col(0).swap(result->col(good_row0));
|
||||
}
|
||||
tmp1 = _mm_mul_ps(row2, row3);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||
minor0 = _mm_mul_ps(row1, tmp1);
|
||||
minor1 = _mm_mul_ps(row0, tmp1);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||
minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
|
||||
minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
|
||||
minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
|
||||
// -----------------------------------------------
|
||||
tmp1 = _mm_mul_ps(row1, row2);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||
minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
|
||||
minor3 = _mm_mul_ps(row0, tmp1);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
|
||||
minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
|
||||
minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
|
||||
// -----------------------------------------------
|
||||
tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||
row2 = _mm_shuffle_ps(row2, row2, 0x4E);
|
||||
minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
|
||||
minor2 = _mm_mul_ps(row0, tmp1);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
|
||||
minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
|
||||
minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
|
||||
// -----------------------------------------------
|
||||
tmp1 = _mm_mul_ps(row0, row1);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||
minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
|
||||
minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||
minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
|
||||
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
|
||||
// -----------------------------------------------
|
||||
tmp1 = _mm_mul_ps(row0, row3);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
|
||||
minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||
minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
|
||||
minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
|
||||
// -----------------------------------------------
|
||||
tmp1 = _mm_mul_ps(row0, row2);
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||
minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
|
||||
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
|
||||
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
|
||||
minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
|
||||
|
||||
// Evaluation of determinant and its reciprocal value. In the original Intel document,
|
||||
// 1/det was evaluated using a fast rcpps command with subsequent approximation using
|
||||
// the Newton-Raphson algorithm. Here, we go for a IEEE-compliant division instead,
|
||||
// so as to not compromise precision at all.
|
||||
det = _mm_mul_ps(row0, minor0);
|
||||
det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
|
||||
det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
|
||||
// tmp1= _mm_rcp_ss(det);
|
||||
// det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
|
||||
det = _mm_div_ss(_mm_set_ss(1.0f), det); // <--- yay, one original line not copied from Intel
|
||||
det = _mm_shuffle_ps(det, det, 0x00);
|
||||
// warning, Intel's variable naming is very confusing: now 'det' is 1/det !
|
||||
|
||||
// Multiplication of cofactors by 1/det. Storing the inverse matrix to the address in pointer src.
|
||||
minor0 = _mm_mul_ps(det, minor0);
|
||||
float *dst = result.data();
|
||||
_mm_storel_pi((__m64*)(dst), minor0);
|
||||
_mm_storeh_pi((__m64*)(dst+2), minor0);
|
||||
minor1 = _mm_mul_ps(det, minor1);
|
||||
_mm_storel_pi((__m64*)(dst+4), minor1);
|
||||
_mm_storeh_pi((__m64*)(dst+6), minor1);
|
||||
minor2 = _mm_mul_ps(det, minor2);
|
||||
_mm_storel_pi((__m64*)(dst+ 8), minor2);
|
||||
_mm_storeh_pi((__m64*)(dst+10), minor2);
|
||||
minor3 = _mm_mul_ps(det, minor3);
|
||||
_mm_storel_pi((__m64*)(dst+12), minor3);
|
||||
_mm_storeh_pi((__m64*)(dst+14), minor3);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
/***********************************************
|
||||
*** Part 2 : selector and MatrixBase methods ***
|
||||
@ -210,7 +264,7 @@ struct ei_compute_inverse<MatrixType, 4>
|
||||
{
|
||||
static inline void run(const MatrixType& matrix, MatrixType* result)
|
||||
{
|
||||
ei_compute_inverse_in_size4_case(matrix, result);
|
||||
ei_compute_inverse_in_size4_case<MatrixType>::run(matrix, *result);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -45,7 +45,6 @@ template<typename MatrixType> void inverse_permutation_4x4()
|
||||
{
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
double error_max = 0.;
|
||||
Vector4i indices(0,1,2,3);
|
||||
for(int i = 0; i < 24; ++i)
|
||||
{
|
||||
@ -56,12 +55,9 @@ template<typename MatrixType> void inverse_permutation_4x4()
|
||||
m(indices(3),3) = 1;
|
||||
MatrixType inv = m.inverse();
|
||||
double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
|
||||
error_max = std::max(error_max, error);
|
||||
VERIFY(error == 0.0);
|
||||
std::next_permutation(indices.data(),indices.data()+4);
|
||||
}
|
||||
std::cerr << "inverse_permutation_4x4, Scalar = " << type_name<Scalar>() << std::endl;
|
||||
EIGEN_DEBUG_VAR(error_max);
|
||||
VERIFY(error_max < 1. );
|
||||
}
|
||||
|
||||
template<typename MatrixType> void inverse_general_4x4(int repeat)
|
||||
@ -86,8 +82,8 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
|
||||
double error_avg = error_sum / repeat;
|
||||
EIGEN_DEBUG_VAR(error_avg);
|
||||
EIGEN_DEBUG_VAR(error_max);
|
||||
VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.4 : 1.4) );
|
||||
VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 160.0 : 75.) );
|
||||
VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
|
||||
VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
|
||||
}
|
||||
|
||||
void test_prec_inverse_4x4()
|
||||
|
Loading…
x
Reference in New Issue
Block a user