Unify Inverse_SSE.h and Inverse_NEON.h into a single generic implementation using PacketMath.

This commit is contained in:
Guoqiang QI 2020-11-17 12:27:01 +00:00 committed by David Tellenbach
parent 8e9cc5b10a
commit 394f564055
8 changed files with 482 additions and 722 deletions

View File

@ -40,12 +40,8 @@
// Use the SSE optimized version whenever possible. At the moment the
// SSE version doesn't compile when AVX is enabled
#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/LU/arch/Inverse_SSE.h"
#endif
#if defined EIGEN_VECTORIZE_NEON
#include "src/LU/arch/Inverse_NEON.h"
#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON
#include "src/LU/arch/InverseSize4.h"
#endif
#include "src/Core/util/ReenableStupidWarnings.h"

View File

@ -84,6 +84,53 @@ typedef uint64x2_t Packet2ul;
#endif // EIGEN_COMP_MSVC
// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
// to enable a shared implementation for fast inversion of matrices of size 4.
template<bool interleave>
EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask)
{
const float* a = reinterpret_cast<const float*>(&m);
const float* b = reinterpret_cast<const float*>(&n);
Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
return res;
}
template<>
EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask)
{
const float* a = reinterpret_cast<const float*>(&m);
const float* b = reinterpret_cast<const float*>(&n);
Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
return res;
}
EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
{
return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
{
return shuffle<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
{
return shuffle<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
{
return shuffle<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
{
return shuffle<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
}
#define vec4f_duplane(a, p) \
vdupq_lane_f32(vget_low_f32(a), p)
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@ -3525,6 +3572,32 @@ template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2
typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;
// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
// for fast inversion of matrices of size 4.
EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
{
const double* a = reinterpret_cast<const double*>(&m);
const double* b = reinterpret_cast<const double*>(&n);
Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
return res;
}
EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
{
return shuffle(a, b, mask);
}
EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
{
return shuffle(a, b, 0);
}
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
{
return shuffle(a, b, 3);
}
#define vec2d_duplane(a, p) \
vdupq_laneq_f64(a, p)
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;

View File

@ -1401,6 +1401,14 @@ template <>
EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
return vreinterpretq_u64_f64(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
return vreinterpretq_f64_s32(a);
}
template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
return vreinterpretq_s32_f64(a);
}
#endif // EIGEN_ARCH_ARM64

View File

@ -52,22 +52,59 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
template<> struct is_arithmetic<Packet4i> { enum { value = true }; };
template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
template<int p, int q, int r, int s>
struct shuffle_mask{
enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
};
// TODO: change the implementation of all swizzle* ops from macro to template,
#define vec4f_swizzle1(v,p,q,r,s) \
(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))
Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
#define vec4i_swizzle1(v,p,q,r,s) \
(_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
#define vec2d_swizzle1(v,p,q) \
(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1))))
Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
#define vec4f_swizzle2(a,b,p,q,r,s) \
(_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
#define vec4i_swizzle2(a,b,p,q,r,s) \
(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))))
Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
{
return Packet4f(_mm_movelh_ps(a,b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
{
return Packet4f(_mm_movehl_ps(a,b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
{
return Packet4f(_mm_unpacklo_ps(a,b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
{
return Packet4f(_mm_unpackhi_ps(a,b));
}
#define vec4f_duplane(a,p) \
vec4f_swizzle2(a,a,p,p,p,p)
#define vec2d_swizzle2(a,b,mask) \
Packet2d(_mm_shuffle_pd(a,b,mask))
EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
{
return Packet2d(_mm_unpacklo_pd(a,b));
}
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
{
return Packet2d(_mm_unpackhi_pd(a,b));
}
#define vec2d_duplane(a,p) \
vec2d_swizzle2(a,a,(p<<1)|p)
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)

View File

@ -77,6 +77,14 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Pa
return _mm_castsi128_ps(a);
}
template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
return _mm_castsi128_pd(a);
}
template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
return _mm_castpd_si128(a);
}
// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0

View File

@ -0,0 +1,348 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2001 Intel Corporation
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
// adjugate of M and determinant of M respectively. M# is computed block-wise
// using specific formulae. For proof, see:
// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
// Variable names are adopted from \src\LU\Inverse_SSE.h.
//
// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
// comes from the following Intel's library:
// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
//
// Here is the respective copyright and license statement:
//
// Copyright (c) 2001 Intel Corporation.
//
// Permition is granted to use, copy, distribute and prepare derivative works
// of this library for any purpose and without fee, provided, that the above
// copyright notice and this statement appear in all copies.
// Intel makes no representations about the suitability of this software for
// any purpose, and specifically disclaims all warranties.
// See LEGAL.TXT for all the legal information.
//
// TODO: Unify implementations of different data types (i.e. float and double).
#ifndef EIGEN_INVERSE_SIZE_4_H
#define EIGEN_INVERSE_SIZE_4_H
namespace Eigen
{
namespace internal
{
template <typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
{
enum
{
MatrixAlignment = traits<MatrixType>::Alignment,
ResultAlignment = traits<ResultType>::Alignment,
StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
};
typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
static void run(const MatrixType &mat, ResultType &result)
{
ActualMatrixType matrix(mat);
Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
// Four 2x2 sub-matrices of the input matrix
// input = [[A, B],
// [C, D]]
Packet4f A, B, C, D;
if (!StorageOrdersMatch)
{
A = vec4f_unpacklo(_L1, _L2);
B = vec4f_unpacklo(_L3, _L4);
C = vec4f_unpackhi(_L1, _L2);
D = vec4f_unpackhi(_L3, _L4);
}
else
{
A = vec4f_movelh(_L1, _L2);
B = vec4f_movehl(_L2, _L1);
C = vec4f_movelh(_L3, _L4);
D = vec4f_movehl(_L4, _L3);
}
Packet4f AB, DC;
// AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
// DC = D#*C
DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
// determinants of the sub-matrices
Packet4f dA, dB, dC, dD;
dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
dA = psub(dA, vec4f_movehl(dA, dA));
dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
dB = psub(dB, vec4f_movehl(dB, dB));
dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
dC = psub(dC, vec4f_movehl(dC, dC));
dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
dD = psub(dD, vec4f_movehl(dD, dD));
Packet4f d, d1, d2;
d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
d = padd(d, vec4f_movehl(d, d));
d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
d1 = pmul(dA, dD);
d2 = pmul(dB, dC);
// determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
// reciprocal of the determinant of the input matrix, rd = 1/det
Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
// Four sub-matrices of the inverse
Packet4f iA, iB, iC, iD;
// iD = D*|A| - C*A#*B
iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
// iA = A*|D| - B*D#*C
iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
// iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
// iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
const int bits[4] = {0, -2147483648, -2147483648, 0};
const Packet4f p4f_sign_PNNP = preinterpret<Packet4f, Packet4i>(pgather<int, Packet4i>(bits, static_cast<Eigen::Index>(1)));
rd = pxor(rd, p4f_sign_PNNP);
iA = pmul(iA, rd);
iB = pmul(iB, rd);
iC = pmul(iC, rd);
iD = pmul(iD, rd);
Index res_stride = result.outerStride();
float *res = result.data();
pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
}
};
#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
// same algorithm as above, except that each operand is split into
// halves for two registers to hold.
template <typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
{
enum
{
MatrixAlignment = traits<MatrixType>::Alignment,
ResultAlignment = traits<ResultType>::Alignment,
StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
};
typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
MatrixType const &,
typename MatrixType::PlainObject>::type
ActualMatrixType;
static void run(const MatrixType &mat, ResultType &result)
{
ActualMatrixType matrix(mat);
// Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
// row e.g. A1, upper row of A, A2, lower row of A
// input = [[A, B], = [[[A1, [B1,
// [C, D]] A2], B2]],
// [[C1, [D1,
// C2], D2]]]
Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
if (StorageOrdersMatch)
{
A1 = matrix.template packet<MatrixAlignment>(0);
B1 = matrix.template packet<MatrixAlignment>(2);
A2 = matrix.template packet<MatrixAlignment>(4);
B2 = matrix.template packet<MatrixAlignment>(6);
C1 = matrix.template packet<MatrixAlignment>(8);
D1 = matrix.template packet<MatrixAlignment>(10);
C2 = matrix.template packet<MatrixAlignment>(12);
D2 = matrix.template packet<MatrixAlignment>(14);
}
else
{
Packet2d temp;
A1 = matrix.template packet<MatrixAlignment>(0);
C1 = matrix.template packet<MatrixAlignment>(2);
A2 = matrix.template packet<MatrixAlignment>(4);
C2 = matrix.template packet<MatrixAlignment>(6);
temp = A1;
A1 = vec2d_unpacklo(A1, A2);
A2 = vec2d_unpackhi(temp, A2);
temp = C1;
C1 = vec2d_unpacklo(C1, C2);
C2 = vec2d_unpackhi(temp, C2);
B1 = matrix.template packet<MatrixAlignment>(8);
D1 = matrix.template packet<MatrixAlignment>(10);
B2 = matrix.template packet<MatrixAlignment>(12);
D2 = matrix.template packet<MatrixAlignment>(14);
temp = B1;
B1 = vec2d_unpacklo(B1, B2);
B2 = vec2d_unpackhi(temp, B2);
temp = D1;
D1 = vec2d_unpacklo(D1, D2);
D2 = vec2d_unpackhi(temp, D2);
}
// determinants of the sub-matrices
Packet2d dA, dB, dC, dD;
dA = vec2d_swizzle2(A2, A2, 1);
dA = pmul(A1, dA);
dA = psub(dA, vec2d_duplane(dA, 1));
dB = vec2d_swizzle2(B2, B2, 1);
dB = pmul(B1, dB);
dB = psub(dB, vec2d_duplane(dB, 1));
dC = vec2d_swizzle2(C2, C2, 1);
dC = pmul(C1, dC);
dC = psub(dC, vec2d_duplane(dC, 1));
dD = vec2d_swizzle2(D2, D2, 1);
dD = pmul(D1, dD);
dD = psub(dD, vec2d_duplane(dD, 1));
Packet2d DC1, DC2, AB1, AB2;
// AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
AB1 = pmul(B1, vec2d_duplane(A2, 1));
AB2 = pmul(B2, vec2d_duplane(A1, 0));
AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
// DC = D#*C
DC1 = pmul(C1, vec2d_duplane(D2, 1));
DC2 = pmul(C2, vec2d_duplane(D1, 0));
DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
Packet2d d1, d2;
// determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
Packet2d det;
// reciprocal of the determinant of the input matrix, rd = 1/det
Packet2d rd;
d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
rd = padd(d1, d2);
rd = padd(rd, vec2d_duplane(rd, 1));
d1 = pmul(dA, dD);
d2 = pmul(dB, dC);
det = padd(d1, d2);
det = psub(det, rd);
det = vec2d_duplane(det, 0);
rd = pdiv(pset1<Packet2d>(1.0), det);
// rows of four sub-matrices of the inverse
Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
// iD = D*|A| - C*A#*B
iD1 = pmul(AB1, vec2d_duplane(C1, 0));
iD2 = pmul(AB1, vec2d_duplane(C2, 0));
iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
dA = vec2d_duplane(dA, 0);
iD1 = psub(pmul(D1, dA), iD1);
iD2 = psub(pmul(D2, dA), iD2);
// iA = A*|D| - B*D#*C
iA1 = pmul(DC1, vec2d_duplane(B1, 0));
iA2 = pmul(DC1, vec2d_duplane(B2, 0));
iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
dD = vec2d_duplane(dD, 0);
iA1 = psub(pmul(A1, dD), iA1);
iA2 = psub(pmul(A2, dD), iA2);
// iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
dB = vec2d_duplane(dB, 0);
iB1 = psub(pmul(C1, dB), iB1);
iB2 = psub(pmul(C2, dB), iB2);
// iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
dC = vec2d_duplane(dC, 0);
iC1 = psub(pmul(B1, dC), iC1);
iC2 = psub(pmul(B2, dC), iC2);
const int bits1[4] = {0, -2147483648, 0, 0};
const int bits2[4] = {0, 0, 0, -2147483648};
const Packet2d _Sign_NP = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits1, static_cast<Eigen::Index>(1)));
const Packet2d _Sign_PN = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits2, static_cast<Eigen::Index>(1)));
d1 = pxor(rd, _Sign_PN);
d2 = pxor(rd, _Sign_NP);
Index res_stride = result.outerStride();
double *res = result.data();
pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
}
};
#endif
} // namespace internal
} // namespace Eigen
#endif

View File

@ -1,372 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The algorithm below is a re-implementation of \src\LU\Inverse_SSE.h using NEON
// intrinsics. inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
// adjugate of M and determinant of M respectively. M# is computed block-wise
// using specific formulae. For proof, see:
// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
// Variable names are adopted from \src\LU\Inverse_SSE.h.
// TODO: Unify implementations of different data types (i.e. float and double) and
// different sets of instrinsics (i.e. SSE and NEON)
#ifndef EIGEN_INVERSE_NEON_H
#define EIGEN_INVERSE_NEON_H
namespace Eigen
{
namespace internal
{
template <typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::NEON, float, MatrixType, ResultType>
{
enum
{
MatrixAlignment = traits<MatrixType>::Alignment,
ResultAlignment = traits<ResultType>::Alignment,
StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
};
typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
// == false (i.e. shuffle(m, n, mask, false) equals _mm_shuffle_ps(m, n, mask)),
// interleave m and n when interleave == true
static Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask, bool interleave = false)
{
const float *a = reinterpret_cast<const float *>(&m);
const float *b = reinterpret_cast<const float *>(&n);
if (!interleave)
{
Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
return res;
}
else
{
Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
return res;
}
}
static void run(const MatrixType &mat, ResultType &result)
{
ActualMatrixType matrix(mat);
Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
// Four 2x2 sub-matrices of the input matrix
// input = [[A, B],
// [C, D]]
Packet4f A, B, C, D;
if (!StorageOrdersMatch)
{
A = shuffle(_L1, _L2, 0x50, true);
B = shuffle(_L3, _L4, 0x50, true);
C = shuffle(_L1, _L2, 0xFA, true);
D = shuffle(_L3, _L4, 0xFA, true);
}
else
{
A = shuffle(_L1, _L2, 0x44);
B = shuffle(_L1, _L2, 0xEE);
C = shuffle(_L3, _L4, 0x44);
D = shuffle(_L3, _L4, 0xEE);
}
Packet4f AB, DC, temp;
// AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
AB = shuffle(A, A, 0x0F);
AB = pmul(AB, B);
temp = shuffle(A, A, 0xA5);
temp = pmul(temp, shuffle(B, B, 0x4E));
AB = psub(AB, temp);
// DC = D#*C
DC = shuffle(D, D, 0x0F);
DC = pmul(DC, C);
temp = shuffle(D, D, 0xA5);
temp = pmul(temp, shuffle(C, C, 0x4E));
DC = psub(DC, temp);
// determinants of the sub-matrices
Packet4f dA, dB, dC, dD;
dA = pmul(shuffle(A, A, 0x5F), A);
dA = psub(dA, shuffle(dA, dA, 0xEE));
dB = pmul(shuffle(B, B, 0x5F), B);
dB = psub(dB, shuffle(dB, dB, 0xEE));
dC = pmul(shuffle(C, C, 0x5F), C);
dC = psub(dC, shuffle(dC, dC, 0xEE));
dD = pmul(shuffle(D, D, 0x5F), D);
dD = psub(dD, shuffle(dD, dD, 0xEE));
Packet4f d, d1, d2;
Packet2f sum;
temp = shuffle(DC, DC, 0xD8);
d = pmul(temp, AB);
sum = vpadd_f32(vadd_f32(vget_low_f32(d), vget_high_f32(d)), vadd_f32(vget_low_f32(d), vget_high_f32(d)));
d = vdupq_lane_f32(sum, 0);
d1 = pmul(dA, dD);
d2 = pmul(dB, dC);
// determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
Packet4f det = psub(padd(d1, d2), d);
// reciprocal of the determinant of the input matrix, rd = 1/det
Packet4f rd = pdiv(vdupq_n_f32(float32_t(1.0)), det);
// Four sub-matrices of the inverse
Packet4f iA, iB, iC, iD;
// iD = D*|A| - C*A#*B
temp = shuffle(C, C, 0xA0);
temp = pmul(temp, shuffle(AB, AB, 0x44));
iD = shuffle(C, C, 0xF5);
iD = pmul(iD, shuffle(AB, AB, 0xEE));
iD = padd(iD, temp);
iD = psub(vmulq_lane_f32(D, vget_low_f32(dA), 0), iD);
// iA = A*|D| - B*D#*C
temp = shuffle(B, B, 0xA0);
temp = pmul(temp, shuffle(DC, DC, 0x44));
iA = shuffle(B, B, 0xF5);
iA = pmul(iA, shuffle(DC, DC, 0xEE));
iA = padd(iA, temp);
iA = psub(vmulq_lane_f32(A, vget_low_f32(dD), 0), iA);
// iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
iB = pmul(D, shuffle(AB, AB, 0x33));
iB = psub(iB, pmul(shuffle(D, D, 0xB1), shuffle(AB, AB, 0x66)));
iB = psub(vmulq_lane_f32(C, vget_low_f32(dB), 0), iB);
// iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
iC = pmul(A, shuffle(DC, DC, 0x33));
iC = psub(iC, pmul(shuffle(A, A, 0xB1), shuffle(DC, DC, 0x66)));
iC = psub(vmulq_lane_f32(B, vget_low_f32(dC), 0), iC);
const Packet4f coeff = {1.0, -1.0, -1.0, 1.0};
rd = pmul(vdupq_lane_f32(vget_low_f32(rd), 0), coeff);
iA = pmul(iA, rd);
iB = pmul(iB, rd);
iC = pmul(iC, rd);
iD = pmul(iD, rd);
Index res_stride = result.outerStride();
float *res = result.data();
pstoret<float, Packet4f, ResultAlignment>(res + 0, shuffle(iA, iB, 0x77));
pstoret<float, Packet4f, ResultAlignment>(res + res_stride, shuffle(iA, iB, 0x22));
pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, shuffle(iC, iD, 0x77));
pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, shuffle(iC, iD, 0x22));
}
};
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
// same algorithm as above, except that each operand is split into
// halves for two registers to hold.
template <typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::NEON, double, MatrixType, ResultType>
{
enum
{
MatrixAlignment = traits<MatrixType>::Alignment,
ResultAlignment = traits<ResultType>::Alignment,
StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
};
typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
MatrixType const &,
typename MatrixType::PlainObject>::type
ActualMatrixType;
// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
static Packet2d shuffle(const Packet2d &m, const Packet2d &n, int mask)
{
const double *a = reinterpret_cast<const double *>(&m);
const double *b = reinterpret_cast<const double *>(&n);
Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
return res;
}
static void run(const MatrixType &mat, ResultType &result)
{
ActualMatrixType matrix(mat);
// Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
// row e.g. A1, upper row of A, A2, lower row of A
// input = [[A, B], = [[[A1, [B1,
// [C, D]] A2], B2]],
// [[C1, [D1,
// C2], D2]]]
Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
if (StorageOrdersMatch)
{
A1 = matrix.template packet<MatrixAlignment>(0);
B1 = matrix.template packet<MatrixAlignment>(2);
A2 = matrix.template packet<MatrixAlignment>(4);
B2 = matrix.template packet<MatrixAlignment>(6);
C1 = matrix.template packet<MatrixAlignment>(8);
D1 = matrix.template packet<MatrixAlignment>(10);
C2 = matrix.template packet<MatrixAlignment>(12);
D2 = matrix.template packet<MatrixAlignment>(14);
}
else
{
Packet2d temp;
A1 = matrix.template packet<MatrixAlignment>(0);
C1 = matrix.template packet<MatrixAlignment>(2);
A2 = matrix.template packet<MatrixAlignment>(4);
C2 = matrix.template packet<MatrixAlignment>(6);
temp = A1;
A1 = shuffle(A1, A2, 0);
A2 = shuffle(temp, A2, 3);
temp = C1;
C1 = shuffle(C1, C2, 0);
C2 = shuffle(temp, C2, 3);
B1 = matrix.template packet<MatrixAlignment>(8);
D1 = matrix.template packet<MatrixAlignment>(10);
B2 = matrix.template packet<MatrixAlignment>(12);
D2 = matrix.template packet<MatrixAlignment>(14);
temp = B1;
B1 = shuffle(B1, B2, 0);
B2 = shuffle(temp, B2, 3);
temp = D1;
D1 = shuffle(D1, D2, 0);
D2 = shuffle(temp, D2, 3);
}
// determinants of the sub-matrices
Packet2d dA, dB, dC, dD;
dA = shuffle(A2, A2, 1);
dA = pmul(A1, dA);
dA = psub(dA, vdupq_laneq_f64(dA, 1));
dB = shuffle(B2, B2, 1);
dB = pmul(B1, dB);
dB = psub(dB, vdupq_laneq_f64(dB, 1));
dC = shuffle(C2, C2, 1);
dC = pmul(C1, dC);
dC = psub(dC, vdupq_laneq_f64(dC, 1));
dD = shuffle(D2, D2, 1);
dD = pmul(D1, dD);
dD = psub(dD, vdupq_laneq_f64(dD, 1));
Packet2d DC1, DC2, AB1, AB2;
// AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
AB1 = pmul(B1, vdupq_laneq_f64(A2, 1));
AB2 = pmul(B2, vdupq_laneq_f64(A1, 0));
AB1 = psub(AB1, pmul(B2, vdupq_laneq_f64(A1, 1)));
AB2 = psub(AB2, pmul(B1, vdupq_laneq_f64(A2, 0)));
// DC = D#*C
DC1 = pmul(C1, vdupq_laneq_f64(D2, 1));
DC2 = pmul(C2, vdupq_laneq_f64(D1, 0));
DC1 = psub(DC1, pmul(C2, vdupq_laneq_f64(D1, 1)));
DC2 = psub(DC2, pmul(C1, vdupq_laneq_f64(D2, 0)));
Packet2d d1, d2;
// determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
Packet2d det;
// reciprocal of the determinant of the input matrix, rd = 1/det
Packet2d rd;
d1 = pmul(AB1, shuffle(DC1, DC2, 0));
d2 = pmul(AB2, shuffle(DC1, DC2, 3));
rd = padd(d1, d2);
rd = padd(rd, vdupq_laneq_f64(rd, 1));
d1 = pmul(dA, dD);
d2 = pmul(dB, dC);
det = padd(d1, d2);
det = psub(det, rd);
det = vdupq_laneq_f64(det, 0);
rd = pdiv(vdupq_n_f64(float64_t(1.0)), det);
// rows of four sub-matrices of the inverse
Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
// iD = D*|A| - C*A#*B
iD1 = pmul(AB1, vdupq_laneq_f64(C1, 0));
iD2 = pmul(AB1, vdupq_laneq_f64(C2, 0));
iD1 = padd(iD1, pmul(AB2, vdupq_laneq_f64(C1, 1)));
iD2 = padd(iD2, pmul(AB2, vdupq_laneq_f64(C2, 1)));
dA = vdupq_laneq_f64(dA, 0);
iD1 = psub(pmul(D1, dA), iD1);
iD2 = psub(pmul(D2, dA), iD2);
// iA = A*|D| - B*D#*C
iA1 = pmul(DC1, vdupq_laneq_f64(B1, 0));
iA2 = pmul(DC1, vdupq_laneq_f64(B2, 0));
iA1 = padd(iA1, pmul(DC2, vdupq_laneq_f64(B1, 1)));
iA2 = padd(iA2, pmul(DC2, vdupq_laneq_f64(B2, 1)));
dD = vdupq_laneq_f64(dD, 0);
iA1 = psub(pmul(A1, dD), iA1);
iA2 = psub(pmul(A2, dD), iA2);
// iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
iB1 = pmul(D1, shuffle(AB2, AB1, 1));
iB2 = pmul(D2, shuffle(AB2, AB1, 1));
iB1 = psub(iB1, pmul(shuffle(D1, D1, 1), shuffle(AB2, AB1, 2)));
iB2 = psub(iB2, pmul(shuffle(D2, D2, 1), shuffle(AB2, AB1, 2)));
dB = vdupq_laneq_f64(dB, 0);
iB1 = psub(pmul(C1, dB), iB1);
iB2 = psub(pmul(C2, dB), iB2);
// iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
iC1 = pmul(A1, shuffle(DC2, DC1, 1));
iC2 = pmul(A2, shuffle(DC2, DC1, 1));
iC1 = psub(iC1, pmul(shuffle(A1, A1, 1), shuffle(DC2, DC1, 2)));
iC2 = psub(iC2, pmul(shuffle(A2, A2, 1), shuffle(DC2, DC1, 2)));
dC = vdupq_laneq_f64(dC, 0);
iC1 = psub(pmul(B1, dC), iC1);
iC2 = psub(pmul(B2, dC), iC2);
const Packet2d PN = {1.0, -1.0};
const Packet2d NP = {-1.0, 1.0};
d1 = pmul(PN, rd);
d2 = pmul(NP, rd);
Index res_stride = result.outerStride();
double *res = result.data();
pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(shuffle(iA2, iA1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(shuffle(iA2, iA1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(shuffle(iB2, iB1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(shuffle(iB2, iB1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(shuffle(iC2, iC1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(shuffle(iC2, iC1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(shuffle(iD2, iD1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(shuffle(iD2, iD1, 0), d2));
}
};
#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
} // namespace internal
} // namespace Eigen
#endif

View File

@ -1,338 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2001 Intel Corporation
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// The SSE code for the 4x4 float and double matrix inverse in this file
// comes from the following Intel's library:
// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
//
// Here is the respective copyright and license statement:
//
// Copyright (c) 2001 Intel Corporation.
//
// Permition is granted to use, copy, distribute and prepare derivative works
// of this library for any purpose and without fee, provided, that the above
// copyright notice and this statement appear in all copies.
// Intel makes no representations about the suitability of this software for
// any purpose, and specifically disclaims all warranties.
// See LEGAL.TXT for all the legal information.
#ifndef EIGEN_INVERSE_SSE_H
#define EIGEN_INVERSE_SSE_H
namespace Eigen {
namespace internal {
template<typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
{
enum {
MatrixAlignment = traits<MatrixType>::Alignment,
ResultAlignment = traits<ResultType>::Alignment,
StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
};
typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
static void run(const MatrixType& mat, ResultType& result)
{
ActualMatrixType matrix(mat);
const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
// Load the full matrix into registers
__m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
__m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
__m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
__m128 _L4 = matrix.template packet<MatrixAlignment>(12);
// The inverse is calculated using "Divide and Conquer" technique. The
// original matrix is divide into four 2x2 sub-matrices. Since each
// register holds four matrix element, the smaller matrices are
// represented as a registers. Hence we get a better locality of the
// calculations.
__m128 A, B, C, D; // the four sub-matrices
if(!StorageOrdersMatch)
{
A = _mm_unpacklo_ps(_L1, _L2);
B = _mm_unpacklo_ps(_L3, _L4);
C = _mm_unpackhi_ps(_L1, _L2);
D = _mm_unpackhi_ps(_L3, _L4);
}
else
{
A = _mm_movelh_ps(_L1, _L2);
B = _mm_movehl_ps(_L2, _L1);
C = _mm_movelh_ps(_L3, _L4);
D = _mm_movehl_ps(_L4, _L3);
}
__m128 iA, iB, iC, iD, // partial inverse of the sub-matrices
DC, AB;
__m128 dA, dB, dC, dD; // determinant of the sub-matrices
__m128 det, d, d1, d2;
__m128 rd; // reciprocal of the determinant
// AB = A# * B
AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
// DC = D# * C
DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
// dA = |A|
dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
// dB = |B|
dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
// dC = |C|
dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
// dD = |D|
dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
// d = trace(AB*DC) = trace(A#*B*D#*C)
d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
// iD = C*A#*B
iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
// iA = B*D#*C
iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
// d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
d = _mm_add_ps(d, _mm_movehl_ps(d, d));
d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
d1 = _mm_mul_ss(dA,dD);
d2 = _mm_mul_ss(dB,dC);
// iD = D*|A| - C*A#*B
iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
// iA = A*|D| - B*D#*C;
iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
// det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
rd = _mm_div_ss(_mm_set_ss(1.0f), det);
// #ifdef ZERO_SINGULAR
// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
// #endif
// iB = D * (A#B)# = D*B#*A
iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
// iC = A * (D#C)# = A*C#*D
iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
rd = _mm_shuffle_ps(rd,rd,0);
rd = _mm_xor_ps(rd, p4f_sign_PNNP);
// iB = C*|B| - D*B#*A
iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
// iC = B*|C| - A*C#*D;
iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
// iX = iX / det
iA = _mm_mul_ps(rd,iA);
iB = _mm_mul_ps(rd,iB);
iC = _mm_mul_ps(rd,iC);
iD = _mm_mul_ps(rd,iD);
Index res_stride = result.outerStride();
float* res = result.data();
pstoret<float, Packet4f, ResultAlignment>(res+0, _mm_shuffle_ps(iA,iB,0x77));
pstoret<float, Packet4f, ResultAlignment>(res+res_stride, _mm_shuffle_ps(iA,iB,0x22));
pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
}
};
template<typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
{
enum {
MatrixAlignment = traits<MatrixType>::Alignment,
ResultAlignment = traits<ResultType>::Alignment,
StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
};
typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
static void run(const MatrixType& mat, ResultType& result)
{
ActualMatrixType matrix(mat);
const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
// The inverse is calculated using "Divide and Conquer" technique. The
// original matrix is divide into four 2x2 sub-matrices. Since each
// register of the matrix holds two elements, the smaller matrices are
// consisted of two registers. Hence we get a better locality of the
// calculations.
// the four sub-matrices
__m128d A1, A2, B1, B2, C1, C2, D1, D2;
if(StorageOrdersMatch)
{
A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
}
else
{
__m128d tmp;
A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
tmp = A1;
A1 = _mm_unpacklo_pd(A1,A2);
A2 = _mm_unpackhi_pd(tmp,A2);
tmp = C1;
C1 = _mm_unpacklo_pd(C1,C2);
C2 = _mm_unpackhi_pd(tmp,C2);
B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
tmp = B1;
B1 = _mm_unpacklo_pd(B1,B2);
B2 = _mm_unpackhi_pd(tmp,B2);
tmp = D1;
D1 = _mm_unpacklo_pd(D1,D2);
D2 = _mm_unpackhi_pd(tmp,D2);
}
__m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices
DC1, DC2, AB1, AB2;
__m128d dA, dB, dC, dD; // determinant of the sub-matrices
__m128d det, d1, d2, rd;
// dA = |A|
dA = _mm_shuffle_pd(A2, A2, 1);
dA = _mm_mul_pd(A1, dA);
dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
// dB = |B|
dB = _mm_shuffle_pd(B2, B2, 1);
dB = _mm_mul_pd(B1, dB);
dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
// AB = A# * B
AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
// dC = |C|
dC = _mm_shuffle_pd(C2, C2, 1);
dC = _mm_mul_pd(C1, dC);
dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
// dD = |D|
dD = _mm_shuffle_pd(D2, D2, 1);
dD = _mm_mul_pd(D1, dD);
dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
// DC = D# * C
DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
// rd = trace(AB*DC) = trace(A#*B*D#*C)
d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
rd = _mm_add_pd(d1, d2);
rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
// iD = C*A#*B
iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
// iA = B*D#*C
iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
// iD = D*|A| - C*A#*B
dA = _mm_shuffle_pd(dA,dA,0);
iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
// iA = A*|D| - B*D#*C;
dD = _mm_shuffle_pd(dD,dD,0);
iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
d1 = _mm_mul_sd(dA, dD);
d2 = _mm_mul_sd(dB, dC);
// iB = D * (A#B)# = D*B#*A
iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
// det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
det = _mm_add_sd(d1, d2);
det = _mm_sub_sd(det, rd);
// iC = A * (D#C)# = A*C#*D
iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
rd = _mm_div_sd(_mm_set_sd(1.0), det);
// #ifdef ZERO_SINGULAR
// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
// #endif
rd = _mm_shuffle_pd(rd,rd,0);
// iB = C*|B| - D*B#*A
dB = _mm_shuffle_pd(dB,dB,0);
iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
d1 = _mm_xor_pd(rd, _Sign_PN);
d2 = _mm_xor_pd(rd, _Sign_NP);
// iC = B*|C| - A*C#*D;
dC = _mm_shuffle_pd(dC,dC,0);
iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
Index res_stride = result.outerStride();
double* res = result.data();
pstoret<double, Packet2d, ResultAlignment>(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_INVERSE_SSE_H