From 28cd28072657e8367c0c611c8f155b17947bfbf4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= <cantonios@google.com>
Date: Tue, 4 Oct 2022 16:05:49 +0000
Subject: [PATCH] Fix 4x4 inverse when compiling with -Ofast.

(cherry picked from commit 7d6a9925cc38842359750f3e06263e20b7635436)
---
 Eigen/src/LU/arch/InverseSize4.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index a232ffc0a..178bc3895 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -35,6 +35,13 @@
 #ifndef EIGEN_INVERSE_SIZE_4_H
 #define EIGEN_INVERSE_SIZE_4_H
 
+#ifdef EIGEN_COMP_GNUC
+// These routines require bit manipulation of the sign, which is not compatible
+// with fast-math.
+#pragma GCC push_options
+#pragma GCC optimize ("no-fast-math")
+#endif
+
 namespace Eigen
 {
 namespace internal
@@ -143,8 +150,8 @@ struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType
     iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
     iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
 
-    const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
-    const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
+    EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
+    const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);
     rd = pxor(rd, p4f_sign_PNNP);
     iA = pmul(iA, rd);
     iB = pmul(iB, rd);
@@ -326,10 +333,10 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
     iC1 = psub(pmul(B1, dC), iC1);
     iC2 = psub(pmul(B2, dC), iC2);
 
-    const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
-    const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
-    const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
-    const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
+    EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0};
+    EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0};
+    const Packet2d sign_PN = pload<Packet2d>(sign_mask1);
+    const Packet2d sign_NP = pload<Packet2d>(sign_mask2);
     d1 = pxor(rd, sign_PN);
     d2 = pxor(rd, sign_NP);
 
@@ -348,4 +355,9 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
 #endif
 } // namespace internal
 } // namespace Eigen
+
+#ifdef EIGEN_COMP_GNUC
+#pragma GCC pop_options
+#endif
+
 #endif