mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
add SSE path for Matrix4f inverse, taken from Intel except that we do a kosher
division instead of RCPPS-followed-by-Newton-Raphson. The rationale for that is that elsewhere in Eigen we dont allow ourselves this approximation (which throws 2 bits of mantissa), so there's no reason we should allow it here.
This commit is contained in:
parent
d5f3b2dc94
commit
d02eccf584
@ -1,4 +1,4 @@
|
|||||||
GNU LESSER GENERAL PUBLIC LICENSE
|
GNU LESSER GENERAL PUBLIC LICENSE
|
||||||
Version 3, 29 June 2007
|
Version 3, 29 June 2007
|
||||||
|
|
||||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||||
|
1
Eigen/LU
1
Eigen/LU
@ -25,6 +25,7 @@ namespace Eigen {
|
|||||||
#include "src/LU/PartialPivLU.h"
|
#include "src/LU/PartialPivLU.h"
|
||||||
#include "src/LU/Determinant.h"
|
#include "src/LU/Determinant.h"
|
||||||
#include "src/LU/Inverse.h"
|
#include "src/LU/Inverse.h"
|
||||||
|
#include "src/LU/arch/Inverse_SSE.h"
|
||||||
|
|
||||||
} // namespace Eigen
|
} // namespace Eigen
|
||||||
|
|
||||||
|
@ -182,10 +182,10 @@ struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
|
|||||||
*** Size 4 implementation ***
|
*** Size 4 implementation ***
|
||||||
****************************/
|
****************************/
|
||||||
|
|
||||||
template<typename MatrixType, typename ResultType>
|
template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
|
||||||
struct ei_compute_inverse<MatrixType, ResultType, 4>
|
struct ei_compute_inverse_size4
|
||||||
{
|
{
|
||||||
static inline void run(const MatrixType& matrix, ResultType& result)
|
static void run(const MatrixType& matrix, ResultType& result)
|
||||||
{
|
{
|
||||||
result.coeffRef(0,0) = matrix.minor(0,0).determinant();
|
result.coeffRef(0,0) = matrix.minor(0,0).determinant();
|
||||||
result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
|
result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
|
||||||
@ -207,6 +207,13 @@ struct ei_compute_inverse<MatrixType, ResultType, 4>
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename MatrixType, typename ResultType>
|
||||||
|
struct ei_compute_inverse<MatrixType, ResultType, 4>
|
||||||
|
: ei_compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar,
|
||||||
|
MatrixType, ResultType>
|
||||||
|
{
|
||||||
|
};
|
||||||
|
|
||||||
template<typename MatrixType, typename ResultType>
|
template<typename MatrixType, typename ResultType>
|
||||||
struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
|
struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
|
||||||
{
|
{
|
||||||
@ -238,7 +245,8 @@ template<typename MatrixType>
|
|||||||
struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
|
struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
|
||||||
{
|
{
|
||||||
// for 2x2, it's worth giving a chance to avoid evaluating.
|
// for 2x2, it's worth giving a chance to avoid evaluating.
|
||||||
// for larger sizes, evaluating has negligible cost and limits code size.
|
// for larger sizes, evaluating has negligible cost, limits code size,
|
||||||
|
// and allows for vectorized paths.
|
||||||
typedef typename ei_meta_if<
|
typedef typename ei_meta_if<
|
||||||
MatrixType::RowsAtCompileTime == 2,
|
MatrixType::RowsAtCompileTime == 2,
|
||||||
typename ei_nested<MatrixType,2>::type,
|
typename ei_nested<MatrixType,2>::type,
|
||||||
@ -264,7 +272,7 @@ struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
|
|||||||
*
|
*
|
||||||
* \returns the matrix inverse of this matrix.
|
* \returns the matrix inverse of this matrix.
|
||||||
*
|
*
|
||||||
* For small fixed sizes up to 4x4, this method uses ad-hoc methods (cofactors up to 3x3, Euler's trick for 4x4).
|
* For small fixed sizes up to 4x4, this method uses cofactors.
|
||||||
* In the general case, this method uses class PartialPivLU.
|
* In the general case, this method uses class PartialPivLU.
|
||||||
*
|
*
|
||||||
* \note This matrix must be invertible, otherwise the result is undefined. If you need an
|
* \note This matrix must be invertible, otherwise the result is undefined. If you need an
|
||||||
|
6
Eigen/src/LU/arch/CMakeLists.txt
Normal file
6
Eigen/src/LU/arch/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
FILE(GLOB Eigen_LU_arch_SRCS "*.h")
|
||||||
|
|
||||||
|
INSTALL(FILES
|
||||||
|
${Eigen_LU_arch_SRCS}
|
||||||
|
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU/arch COMPONENT Devel
|
||||||
|
)
|
152
Eigen/src/LU/arch/Inverse_SSE.h
Normal file
152
Eigen/src/LU/arch/Inverse_SSE.h
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 1999 Intel Corporation
|
||||||
|
// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
|
||||||
|
//
|
||||||
|
// Eigen is free software; you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU Lesser General Public
|
||||||
|
// License as published by the Free Software Foundation; either
|
||||||
|
// version 3 of the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Alternatively, you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU General Public License as
|
||||||
|
// published by the Free Software Foundation; either version 2 of
|
||||||
|
// the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||||
|
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public
|
||||||
|
// License and a copy of the GNU General Public License along with
|
||||||
|
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
// The SSE code for the 4x4 float matrix inverse in this file comes from the file
|
||||||
|
// ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
|
||||||
|
// See page ii of that document for legal stuff. Not being lawyers, we just assume
|
||||||
|
// here that if Intel makes this document publically available, with source code
|
||||||
|
// and detailed explanations, it's because they want their CPUs to be fed with
|
||||||
|
// good code, and therefore they presumably don't mind us using it in Eigen.
|
||||||
|
|
||||||
|
#ifndef EIGEN_INVERSE_SSE_H
|
||||||
|
#define EIGEN_INVERSE_SSE_H
|
||||||
|
|
||||||
|
template<typename MatrixType, typename ResultType>
|
||||||
|
struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
|
||||||
|
{
|
||||||
|
static void run(const MatrixType& matrix, ResultType& result)
|
||||||
|
{
|
||||||
|
// Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the
|
||||||
|
// lines of the inverted matrix.
|
||||||
|
__m128 minor0, minor1, minor2, minor3;
|
||||||
|
|
||||||
|
// Variables which will contain the lines of the reference matrix and, later (after the transposition),
|
||||||
|
// the columns of the original matrix.
|
||||||
|
__m128 row0, row1, row2, row3;
|
||||||
|
|
||||||
|
// Temporary variables and the variable that will contain the matrix determinant.
|
||||||
|
__m128 det, tmp1;
|
||||||
|
|
||||||
|
// Matrix transposition
|
||||||
|
const float *src = matrix.data();
|
||||||
|
tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
|
||||||
|
row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
|
||||||
|
row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
|
||||||
|
row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
|
||||||
|
tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
|
||||||
|
row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
|
||||||
|
row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
|
||||||
|
row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
|
||||||
|
|
||||||
|
|
||||||
|
// Cofactors calculation. Because in the process of cofactor computation some pairs in three-
|
||||||
|
// element products are repeated, it is not reasonable to load these pairs anew every time. The
|
||||||
|
// values in the registers with these pairs are formed using shuffle instruction. Cofactors are
|
||||||
|
// calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register).
|
||||||
|
|
||||||
|
tmp1 = _mm_mul_ps(row2, row3);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||||
|
minor0 = _mm_mul_ps(row1, tmp1);
|
||||||
|
minor1 = _mm_mul_ps(row0, tmp1);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||||
|
minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
|
||||||
|
minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
|
||||||
|
minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
|
||||||
|
// -----------------------------------------------
|
||||||
|
tmp1 = _mm_mul_ps(row1, row2);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||||
|
minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
|
||||||
|
minor3 = _mm_mul_ps(row0, tmp1);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||||
|
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
|
||||||
|
minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
|
||||||
|
minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
|
||||||
|
// -----------------------------------------------
|
||||||
|
tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||||
|
row2 = _mm_shuffle_ps(row2, row2, 0x4E);
|
||||||
|
minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
|
||||||
|
minor2 = _mm_mul_ps(row0, tmp1);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||||
|
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
|
||||||
|
minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
|
||||||
|
minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
|
||||||
|
// -----------------------------------------------
|
||||||
|
tmp1 = _mm_mul_ps(row0, row1);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||||
|
minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
|
||||||
|
minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||||
|
minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
|
||||||
|
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
|
||||||
|
// -----------------------------------------------
|
||||||
|
tmp1 = _mm_mul_ps(row0, row3);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||||
|
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
|
||||||
|
minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||||
|
minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
|
||||||
|
minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
|
||||||
|
// -----------------------------------------------
|
||||||
|
tmp1 = _mm_mul_ps(row0, row2);
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
|
||||||
|
minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
|
||||||
|
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
|
||||||
|
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
|
||||||
|
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
|
||||||
|
minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
|
||||||
|
|
||||||
|
// Evaluation of determinant and its reciprocal value. In the original Intel document,
|
||||||
|
// 1/det was evaluated using a fast rcpps command with subsequent approximation using
|
||||||
|
// the Newton-Raphson algorithm. Here, we go for a IEEE-compliant division instead,
|
||||||
|
// so as to not compromise precision at all.
|
||||||
|
det = _mm_mul_ps(row0, minor0);
|
||||||
|
det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
|
||||||
|
det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
|
||||||
|
// tmp1= _mm_rcp_ss(det);
|
||||||
|
// det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
|
||||||
|
det = _mm_div_ps(ei_pset1<float>(1.0f), det); // <--- yay, one original line not copied from Intel
|
||||||
|
det = _mm_shuffle_ps(det, det, 0x00);
|
||||||
|
// warning, Intel's variable naming is very confusing: now 'det' is 1/det !
|
||||||
|
|
||||||
|
// Multiplication of cofactors by 1/det. Storing the inverse matrix to the address in pointer src.
|
||||||
|
minor0 = _mm_mul_ps(det, minor0);
|
||||||
|
float *dst = result.data();
|
||||||
|
_mm_storel_pi((__m64*)(dst), minor0);
|
||||||
|
_mm_storeh_pi((__m64*)(dst+2), minor0);
|
||||||
|
minor1 = _mm_mul_ps(det, minor1);
|
||||||
|
_mm_storel_pi((__m64*)(dst+4), minor1);
|
||||||
|
_mm_storeh_pi((__m64*)(dst+6), minor1);
|
||||||
|
minor2 = _mm_mul_ps(det, minor2);
|
||||||
|
_mm_storel_pi((__m64*)(dst+ 8), minor2);
|
||||||
|
_mm_storeh_pi((__m64*)(dst+10), minor2);
|
||||||
|
minor3 = _mm_mul_ps(det, minor3);
|
||||||
|
_mm_storel_pi((__m64*)(dst+12), minor3);
|
||||||
|
_mm_storeh_pi((__m64*)(dst+14), minor3);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // EIGEN_INVERSE_SSE_H
|
||||||
|
|
@ -37,12 +37,9 @@ template<typename MatrixType> void inverse_permutation_4x4()
|
|||||||
MatrixType m = PermutationMatrix<4>(indices);
|
MatrixType m = PermutationMatrix<4>(indices);
|
||||||
MatrixType inv = m.inverse();
|
MatrixType inv = m.inverse();
|
||||||
double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
|
double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
|
||||||
error_max = std::max(error_max, error);
|
VERIFY(error == 0.0);
|
||||||
std::next_permutation(indices.data(),indices.data()+4);
|
std::next_permutation(indices.data(),indices.data()+4);
|
||||||
}
|
}
|
||||||
std::cerr << "inverse_permutation_4x4, Scalar = " << type_name<Scalar>() << std::endl;
|
|
||||||
EIGEN_DEBUG_VAR(error_max);
|
|
||||||
VERIFY(error_max < 1. );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename MatrixType> void inverse_general_4x4(int repeat)
|
template<typename MatrixType> void inverse_general_4x4(int repeat)
|
||||||
@ -68,7 +65,7 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
|
|||||||
EIGEN_DEBUG_VAR(error_avg);
|
EIGEN_DEBUG_VAR(error_avg);
|
||||||
EIGEN_DEBUG_VAR(error_max);
|
EIGEN_DEBUG_VAR(error_max);
|
||||||
VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
|
VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
|
||||||
VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 16.0));
|
VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
|
||||||
}
|
}
|
||||||
|
|
||||||
void test_prec_inverse_4x4()
|
void test_prec_inverse_4x4()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user