Merge with master.

Chip Kerchner, 2025-07-10 12:22:28 -04:00
commit 8328eec90d
81 changed files with 5855 additions and 2190 deletions

CHANGELOG.md (new file, 2019 lines)

File diff suppressed because it is too large

View File

@@ -29,6 +29,11 @@ if (POLICY CMP0146)
cmake_policy(SET CMP0146 OLD)
endif ()
# Normalize DESTINATION paths
if (POLICY CMP0177)
cmake_policy(SET CMP0177 NEW)
endif ()
#==============================================================================
# CMake Project.
#==============================================================================
@@ -254,7 +259,7 @@ if(EIGEN_BUILD_CMAKE_PACKAGE)
DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
# Add uninstall target
if(NOT TARGET uninstall)
if(NOT TARGET uninstall AND PROJECT_IS_TOP_LEVEL)
add_custom_target ( uninstall
COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
endif()

View File

@@ -192,45 +192,38 @@ using std::ptrdiff_t;
#include "src/Core/arch/Default/BFloat16.h"
#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
#if defined EIGEN_VECTORIZE_AVX512
#if defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/Reductions.h"
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#endif
#if defined EIGEN_VECTORIZE_AVX
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/Reductions.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#endif
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/AVX512/PacketMath.h"
#include "src/Core/arch/AVX512/Reductions.h"
#include "src/Core/arch/AVX512/Complex.h"
#include "src/Core/arch/AVX512/TypeCasting.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#include "src/Core/arch/AVX512/TrsmKernel.h"
#endif
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/PacketMathFP16.h"
#endif
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/AVX512/TypeCasting.h"
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/TypeCastingFP16.h"
#endif
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX512/Complex.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/MathFunctionsFP16.h"
#endif
#include "src/Core/arch/AVX512/TrsmKernel.h"
#elif defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"
#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
#include "src/Core/arch/AltiVec/PacketMath.h"
#include "src/Core/arch/AltiVec/TypeCasting.h"
#include "src/Core/arch/AltiVec/MathFunctions.h"
@@ -358,6 +351,7 @@ using std::ptrdiff_t;
#include "src/Core/SkewSymmetricMatrix3.h"
#include "src/Core/Redux.h"
#include "src/Core/Visitor.h"
#include "src/Core/FindCoeff.h"
#include "src/Core/Fuzzy.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"

View File

@@ -726,6 +726,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
Index count) const {
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<PacketType, NumPackets> packets;
for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
Index offset = begin / SrcPacketSize;
Index actualBegin = begin % SrcPacketSize;
for (; offset < NumPackets; offset++) {
@@ -743,6 +744,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
Index count) const {
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<PacketType, NumPackets> packets;
for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
Index offset = begin / SrcPacketSize;
Index actualBegin = begin % SrcPacketSize;
for (; offset < NumPackets; offset++) {

View File

@@ -45,10 +45,16 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
// - This is the return type of the coeff() method.
// - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
// to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
// - The DirectAccessBit means exactly that the underlying data of coefficients can be directly accessed as a plain
// strided array, which means exactly that the underlying data of coefficients does exist in memory, which means
// exactly that the coefficients are const-referenceable, which means exactly that we can have coeff() return a const
// reference. For example, Map<const Matrix> has DirectAccessBit but not LvalueBit, so Map<const Matrix>.coeff()
// returns a const Scalar& that exists in memory, while coeffRef() is not allowed since it could not provide an
// lvalue. Notice that DirectAccessBit and LvalueBit are orthogonal.
// - The is_arithmetic check is required since "const int", "const double", etc. will cause warnings on some systems,
// while the declaration of "const T", where T is a non-arithmetic type, does not. Always returning "const Scalar&" is
// not possible, since the underlying expression might not offer a valid address for the reference to refer to.
typedef std::conditional_t<bool(internal::traits<Derived>::Flags& LvalueBit), const Scalar&,
typedef std::conditional_t<bool(internal::traits<Derived>::Flags&(LvalueBit | DirectAccessBit)), const Scalar&,
std::conditional_t<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>>
CoeffReturnType;
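With the widened condition above, DirectAccessBit alone is enough for a const-reference return. A compile-time check, assuming Eigen with this change applied:

#include <Eigen/Core>
#include <type_traits>

int main() {
  // Map<const Matrix> has DirectAccessBit but not LvalueBit, yet coeff() can
  // still hand out a const reference into the mapped storage.
  using MapConst = Eigen::Map<const Eigen::MatrixXf>;
  static_assert(std::is_same<MapConst::CoeffReturnType, const float&>::value,
                "DirectAccessBit alone should yield const Scalar&");
}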

Eigen/src/Core/FindCoeff.h (new file, 464 lines)
View File

@@ -0,0 +1,464 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_FIND_COEFF_H
#define EIGEN_FIND_COEFF_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
struct max_coeff_functor {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return candidate > incumbent;
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pcmp_lt(incumbent, candidate);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_max(a);
}
};
template <typename Scalar>
struct max_coeff_functor<Scalar, PropagateNaN, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate > incumbent) || ((candidate != candidate) && (incumbent == incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_max<PropagateNaN>(a);
}
};
template <typename Scalar>
struct max_coeff_functor<Scalar, PropagateNumbers, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate > incumbent) || ((candidate == candidate) && (incumbent != incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(candidate));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_max<PropagateNumbers>(a);
}
};
template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
struct min_coeff_functor {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return candidate < incumbent;
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pcmp_lt(candidate, incumbent);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_min(a);
}
};
template <typename Scalar>
struct min_coeff_functor<Scalar, PropagateNaN, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate < incumbent) || ((candidate != candidate) && (incumbent == incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_min<PropagateNaN>(a);
}
};
template <typename Scalar>
struct min_coeff_functor<Scalar, PropagateNumbers, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate < incumbent) || ((candidate == candidate) && (incumbent != incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(candidate));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_min<PropagateNumbers>(a);
}
};
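The `x != x` / `x == x` expressions in the PropagateNaN and PropagateNumbers specializations are the classic self-comparison NaN tests. A scalar stand-in for the PropagateNaN max rule, with hypothetical names and std::isnan spelled out, not part of this commit:

#include <cassert>
#include <cmath>

// A candidate displaces the incumbent if it compares greater, or if it is NaN
// while the incumbent is not (so NaN, once seen, sticks).
bool nan_propagating_max_compare(float incumbent, float candidate) {
  return (candidate > incumbent) || (std::isnan(candidate) && !std::isnan(incumbent));
}

int main() {
  assert(nan_propagating_max_compare(1.f, 2.f));   // ordinary max
  assert(nan_propagating_max_compare(1.f, NAN));   // NaN displaces a number
  assert(!nan_propagating_max_compare(NAN, 2.f));  // a number never displaces NaN
}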
template <typename Scalar>
struct min_max_traits {
static constexpr bool PacketAccess = packet_traits<Scalar>::Vectorizable;
};
template <typename Scalar, int NaNPropagation>
struct functor_traits<max_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
template <typename Scalar, int NaNPropagation>
struct functor_traits<min_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
template <typename Evaluator, typename Func, bool Linear, bool Vectorize>
struct find_coeff_loop;
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ false> {
using Scalar = typename Evaluator::Scalar;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& outer, Index& inner) {
Index outerSize = eval.outerSize();
Index innerSize = eval.innerSize();
/* initialization performed in calling function */
/* result = eval.coeff(0, 0); */
/* outer = 0; */
/* inner = 0; */
for (Index j = 0; j < outerSize; j++) {
for (Index i = 0; i < innerSize; i++) {
Scalar xprCoeff = eval.coeffByOuterInner(j, i);
bool newRes = func.compareCoeff(res, xprCoeff);
if (newRes) {
outer = j;
inner = i;
res = xprCoeff;
}
}
}
}
};
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ false> {
using Scalar = typename Evaluator::Scalar;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& index) {
Index size = eval.size();
/* initialization performed in calling function */
/* result = eval.coeff(0); */
/* index = 0; */
for (Index k = 0; k < size; k++) {
Scalar xprCoeff = eval.coeff(k);
bool newRes = func.compareCoeff(res, xprCoeff);
if (newRes) {
index = k;
res = xprCoeff;
}
}
}
};
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ true> {
using ScalarImpl = find_coeff_loop<Evaluator, Func, false, false>;
using Scalar = typename Evaluator::Scalar;
using Packet = typename Evaluator::Packet;
static constexpr int PacketSize = unpacket_traits<Packet>::size;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& outer,
Index& inner) {
Index outerSize = eval.outerSize();
Index innerSize = eval.innerSize();
Index packetEnd = numext::round_down(innerSize, PacketSize);
/* initialization performed in calling function */
/* result = eval.coeff(0, 0); */
/* outer = 0; */
/* inner = 0; */
bool checkPacket = false;
for (Index j = 0; j < outerSize; j++) {
Packet resultPacket = pset1<Packet>(result);
for (Index i = 0; i < packetEnd; i += PacketSize) {
Packet xprPacket = eval.template packetByOuterInner<Unaligned, Packet>(j, i);
if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
outer = j;
inner = i;
result = func.predux(xprPacket);
resultPacket = pset1<Packet>(result);
checkPacket = true;
}
}
for (Index i = packetEnd; i < innerSize; i++) {
Scalar xprCoeff = eval.coeffByOuterInner(j, i);
if (func.compareCoeff(result, xprCoeff)) {
outer = j;
inner = i;
result = xprCoeff;
checkPacket = false;
}
}
}
if (checkPacket) {
result = eval.coeffByOuterInner(outer, inner);
Index i_end = inner + PacketSize;
for (Index i = inner; i < i_end; i++) {
Scalar xprCoeff = eval.coeffByOuterInner(outer, i);
if (func.compareCoeff(result, xprCoeff)) {
inner = i;
result = xprCoeff;
}
}
}
}
};
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ true> {
using ScalarImpl = find_coeff_loop<Evaluator, Func, true, false>;
using Scalar = typename Evaluator::Scalar;
using Packet = typename Evaluator::Packet;
static constexpr int PacketSize = unpacket_traits<Packet>::size;
static constexpr int Alignment = Evaluator::Alignment;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) {
Index size = eval.size();
Index packetEnd = numext::round_down(size, PacketSize);
/* initialization performed in calling function */
/* result = eval.coeff(0); */
/* index = 0; */
Packet resultPacket = pset1<Packet>(result);
bool checkPacket = false;
for (Index k = 0; k < packetEnd; k += PacketSize) {
Packet xprPacket = eval.template packet<Alignment, Packet>(k);
if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
index = k;
result = func.predux(xprPacket);
resultPacket = pset1<Packet>(result);
checkPacket = true;
}
}
for (Index k = packetEnd; k < size; k++) {
Scalar xprCoeff = eval.coeff(k);
if (func.compareCoeff(result, xprCoeff)) {
index = k;
result = xprCoeff;
checkPacket = false;
}
}
if (checkPacket) {
result = eval.coeff(index);
Index k_end = index + PacketSize;
for (Index k = index; k < k_end; k++) {
Scalar xprCoeff = eval.coeff(k);
if (func.compareCoeff(result, xprCoeff)) {
index = k;
result = xprCoeff;
}
}
}
}
};
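Both vectorized loops share one subtlety: a packet hit only records where the winning packet starts, because predux collapses the lanes and loses the exact lane index; the final checkPacket pass rescans that single packet to recover the first matching position, and a later scalar-tail hit cancels the rescan since its index is already exact. A plain-array sketch of the same strategy, with hypothetical names, not part of this commit:

#include <algorithm>
#include <cassert>
#include <cstddef>

// B plays the role of PacketSize; std::max_element stands in for predux_max.
std::size_t argmax_blocked(const float* x, std::size_t n, std::size_t B) {
  float best = x[0];
  std::size_t idx = 0;
  bool fromBlock = false;
  std::size_t k = 0;
  for (; k + B <= n; k += B) {
    float m = *std::max_element(x + k, x + k + B);  // block-wide reduction
    if (m > best) { best = m; idx = k; fromBlock = true; }
  }
  for (; k < n; ++k)  // scalar tail, index already exact
    if (x[k] > best) { best = x[k]; idx = k; fromBlock = false; }
  if (fromBlock)      // the checkPacket step: pin down the lane in the winning block
    idx = static_cast<std::size_t>(std::max_element(x + idx, x + idx + B) - x);
  return idx;
}

int main() {
  float x[] = {1, 5, 2, 7, 7, 0};
  assert(argmax_blocked(x, 6, 2) == 3);  // first of the tied 7s
}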
template <typename Derived>
struct find_coeff_evaluator : public evaluator<Derived> {
using Base = evaluator<Derived>;
using Scalar = typename Derived::Scalar;
using Packet = typename packet_traits<Scalar>::type;
static constexpr int Flags = Base::Flags;
static constexpr bool IsRowMajor = bool(Flags & RowMajorBit);
EIGEN_DEVICE_FUNC inline find_coeff_evaluator(const Derived& xpr) : Base(xpr), m_xpr(xpr) {}
EIGEN_DEVICE_FUNC inline Scalar coeffByOuterInner(Index outer, Index inner) const {
Index row = IsRowMajor ? outer : inner;
Index col = IsRowMajor ? inner : outer;
return Base::coeff(row, col);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC inline PacketType packetByOuterInner(Index outer, Index inner) const {
Index row = IsRowMajor ? outer : inner;
Index col = IsRowMajor ? inner : outer;
return Base::template packet<LoadMode, PacketType>(row, col);
}
EIGEN_DEVICE_FUNC inline Index innerSize() const { return m_xpr.innerSize(); }
EIGEN_DEVICE_FUNC inline Index outerSize() const { return m_xpr.outerSize(); }
EIGEN_DEVICE_FUNC inline Index size() const { return m_xpr.size(); }
const Derived& m_xpr;
};
template <typename Derived, typename Func>
struct find_coeff_impl {
using Evaluator = find_coeff_evaluator<Derived>;
static constexpr int Flags = Evaluator::Flags;
static constexpr int Alignment = Evaluator::Alignment;
static constexpr bool IsRowMajor = Derived::IsRowMajor;
static constexpr int MaxInnerSizeAtCompileTime =
IsRowMajor ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime;
static constexpr int MaxSizeAtCompileTime = Derived::MaxSizeAtCompileTime;
using Scalar = typename Derived::Scalar;
using Packet = typename Evaluator::Packet;
static constexpr int PacketSize = unpacket_traits<Packet>::size;
static constexpr bool Linearize = bool(Flags & LinearAccessBit);
static constexpr bool DontVectorize =
enum_lt_not_dynamic(Linearize ? MaxSizeAtCompileTime : MaxInnerSizeAtCompileTime, PacketSize);
static constexpr bool Vectorize =
!DontVectorize && bool(Flags & PacketAccessBit) && functor_traits<Func>::PacketAccess;
using Loop = find_coeff_loop<Evaluator, Func, Linearize, Vectorize>;
template <bool ForwardLinearAccess = Linearize, std::enable_if_t<!ForwardLinearAccess, bool> = true>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
Index& inner) {
Evaluator eval(xpr);
Loop::run(eval, func, res, outer, inner);
}
template <bool ForwardLinearAccess = Linearize, std::enable_if_t<ForwardLinearAccess, bool> = true>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
Index& inner) {
// where possible, use the linear loop and back-calculate the outer and inner indices
Index index = 0;
run(xpr, func, res, index);
outer = index / xpr.innerSize();
inner = index % xpr.innerSize();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& index) {
Evaluator eval(xpr);
Loop::run(eval, func, res, index);
}
};
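The linear-access shortcut recovers the two-dimensional location with one division and one modulo against innerSize(). For a column-major matrix, innerSize() is the number of rows, so for example:

#include <cassert>

int main() {
  // 4 rows, column-major: linear index 7 lives in column 7 / 4 == 1, row 7 % 4 == 3.
  long innerSize = 4, index = 7;
  assert(index / innerSize == 1);  // outer (column)
  assert(index % innerSize == 3);  // inner (row)
}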
template <typename Derived, typename IndexType, typename Func>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
IndexType* rowPtr, IndexType* colPtr) {
eigen_assert(mat.rows() > 0 && mat.cols() > 0 && "you are using an empty matrix");
using Scalar = typename DenseBase<Derived>::Scalar;
using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
Index outer = 0;
Index inner = 0;
Scalar res = mat.coeff(0, 0);
FindCoeffImpl::run(mat.derived(), func, res, outer, inner);
*rowPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? outer : inner);
if (colPtr) *colPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? inner : outer);
return res;
}
template <typename Derived, typename IndexType, typename Func>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
IndexType* indexPtr) {
eigen_assert(mat.size() > 0 && "you are using an empty matrix");
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
using Scalar = typename DenseBase<Derived>::Scalar;
using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
Index index = 0;
Scalar res = mat.coeff(0);
FindCoeffImpl::run(mat.derived(), func, res, index);
*indexPtr = internal::convert_index<IndexType>(index);
return res;
}
} // namespace internal
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowPtr,
IndexType* colPtr) const {
using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, rowPtr, colPtr);
}
/** \returns the minimum of all coefficients of *this and puts in *index its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* indexPtr) const {
using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, indexPtr);
}
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
IndexType* colPtr) const {
using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, rowPtr, colPtr);
}
/** \returns the maximum of all coefficients of *this and puts in *index its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* indexPtr) const {
using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, indexPtr);
}
} // namespace Eigen
#endif // EIGEN_FIND_COEFF_H
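A possible use of the API documented above, assuming an Eigen build that contains this file: with PropagateNumbers the NaN is skipped and the first minimal finite coefficient is reported together with its location.

#include <Eigen/Core>
#include <cmath>
#include <iostream>

int main() {
  Eigen::MatrixXf m(2, 2);
  m << 3.f, 1.f,
       NAN, 4.f;
  Eigen::Index r, c;
  float v = m.minCoeff<Eigen::PropagateNumbers>(&r, &c);  // skips the NaN
  std::cout << v << " at (" << r << ", " << c << ")\n";   // prints: 1 at (0, 1)
}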

View File

@@ -375,7 +375,7 @@ EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) {
return a && b;
}
// In the generic case, memset to all one bits.
// In the generic packet case, memset to all one bits.
template <typename Packet, typename EnableIf = void>
struct ptrue_impl {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
@@ -385,19 +385,16 @@ struct ptrue_impl {
}
};
// Use a value of one for scalars.
template <typename Scalar>
struct ptrue_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar&) { return Scalar(1); }
};
// For booleans, we can only directly set a valid `bool` value to avoid UB.
template <>
struct ptrue_impl<bool, void> {
static EIGEN_DEVICE_FUNC inline bool run(const bool& /*a*/) { return true; }
};
// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value).
// Although this is technically not a valid bitmask, the scalar path for pselect
// uses a comparison to zero, so this should still work in most cases. We don't
// have another option, since the scalar type requires initialization.
template <typename T>
struct ptrue_impl<T, std::enable_if_t<is_scalar<T>::value && NumTraits<T>::RequireInitialization>> {
static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { return T(1); }
static EIGEN_DEVICE_FUNC inline bool run(const bool&) { return true; }
};
/** \internal \returns one bits. */
@@ -406,7 +403,7 @@ EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& a) {
return ptrue_impl<Packet>::run(a);
}
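The new comment explains why ptrue cannot simply memset every scalar type: for a float, the all-one-bits pattern is not a usable truth value. A standalone check, not part of this commit:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  // 0xFFFFFFFF reinterpreted as an IEEE float is a (negative, quiet) NaN,
  // which is why initialization-requiring scalars get Scalar(1) instead.
  uint32_t bits = 0xFFFFFFFFu;
  float f;
  std::memcpy(&f, &bits, sizeof f);
  assert(std::isnan(f));
}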
// In the general case, memset to zero.
// In the general packet case, memset to zero.
template <typename Packet, typename EnableIf = void>
struct pzero_impl {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
@@ -608,7 +605,7 @@ EIGEN_DEVICE_FUNC inline bool pselect<bool>(const bool& cond, const bool& a, con
/** \internal \returns the min or max of \a a and \a b (coeff-wise)
If either \a a or \a b is NaN, the result is implementation defined. */
template <int NaNPropagation>
template <int NaNPropagation, bool IsInteger>
struct pminmax_impl {
template <typename Packet, typename Op>
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
@@ -619,7 +616,7 @@ struct pminmax_impl {
/** \internal \returns the min or max of \a a and \a b (coeff-wise)
If either \a a or \a b is NaN, NaN is returned. */
template <>
struct pminmax_impl<PropagateNaN> {
struct pminmax_impl<PropagateNaN, false> {
template <typename Packet, typename Op>
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
Packet not_nan_mask_a = pcmp_eq(a, a);
@@ -632,7 +629,7 @@ struct pminmax_impl<PropagateNaN> {
If both \a a and \a b are NaN, NaN is returned.
Equivalent to std::fmin(a, b). */
template <>
struct pminmax_impl<PropagateNumbers> {
struct pminmax_impl<PropagateNumbers, false> {
template <typename Packet, typename Op>
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
Packet not_nan_mask_a = pcmp_eq(a, a);
@@ -654,7 +651,8 @@ EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
NaNPropagation determines the NaN propagation semantics. */
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
}
/** \internal \returns the max of \a a and \a b (coeff-wise)
@@ -668,7 +666,8 @@ EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
NaNPropagation determines the NaN propagation semantics. */
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
}
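At scalar granularity the two policies mirror the C standard library: PropagateNumbers behaves like std::fmin/std::fmax, where a lone NaN is ignored, while PropagateNaN returns NaN as soon as either input is NaN. For instance:

#include <cassert>
#include <cmath>

int main() {
  assert(std::fmin(NAN, 2.0) == 2.0);       // PropagateNumbers: prefer the number
  assert(std::isnan(std::fmin(NAN, NAN)));  // both NaN -> NaN
  // PropagateNaN semantics would instead yield NaN for the (NAN, 2.0) case.
}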
/** \internal \returns the absolute value of \a a */
@@ -873,17 +872,29 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_trait
return a;
}
template <typename Packet, typename EnableIf = void>
struct peven_mask_impl {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet&) {
typedef typename unpacket_traits<Packet>::type Scalar;
const size_t n = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
for (size_t i = 0; i < n; ++i) {
memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
}
return ploadu<Packet>(elements);
}
};
template <typename Scalar>
struct peven_mask_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar&) { return Scalar(1); }
};
/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
where x is the value of all 1-bits. */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& /*a*/) {
typedef typename unpacket_traits<Packet>::type Scalar;
const size_t n = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
for (size_t i = 0; i < n; ++i) {
memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
}
return ploadu<Packet>(elements);
EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& a) {
return peven_mask_impl<Packet>::run(a);
}
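For four float lanes the generic loop above produces bit patterns that alternate between all ones and all zeros. A standalone mirror of that loop, not part of this commit:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float elements[4];
  for (int i = 0; i < 4; ++i)  // even lanes: all one bits; odd lanes: zero
    std::memset(elements + i, (i & 1) == 0 ? 0xff : 0, sizeof(float));
  for (int i = 0; i < 4; ++i) {
    uint32_t bits;
    std::memcpy(&bits, elements + i, sizeof bits);
    std::printf("lane %d: 0x%08x\n", i, (unsigned)bits);  // ffffffff, 00000000, ...
  }
}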
/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
@@ -1244,26 +1255,46 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const
template <typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<PropagateFast, Scalar>)));
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<Scalar>)));
}
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
}
/** \internal \returns the min of the elements of \a a */
/** \internal \returns the max of the elements of \a a */
template <typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<PropagateFast, Scalar>)));
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<Scalar>)));
}
template <int NaNPropagation, typename Packet>
struct predux_min_max_helper_impl {
using Scalar = typename unpacket_traits<Packet>::type;
static constexpr bool UsePredux_ = NaNPropagation == PropagateFast || NumTraits<Scalar>::IsInteger;
template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
}
template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
}
template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
return predux_min(a);
}
template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
return predux_max(a);
}
};
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
return predux_min_max_helper_impl<NaNPropagation, Packet>::run_min(a);
}
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
return predux_min_max_helper_impl<NaNPropagation, Packet>::run_max(a);
}
#undef EIGEN_BINARY_OP_NAN_PROPAGATION

View File

@@ -182,10 +182,6 @@ struct imag_ref_retval {
typedef typename NumTraits<Scalar>::Real& type;
};
// implementation in MathFunctionsImpl.h
template <typename Mask, bool is_built_in_float = std::is_floating_point<Mask>::value>
struct scalar_select_mask;
} // namespace internal
namespace numext {
@@ -211,9 +207,9 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar&
return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
}
template <typename Scalar, typename Mask>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Mask& mask, const Scalar& a, const Scalar& b) {
return internal::scalar_select_mask<Mask>::run(mask) ? b : a;
template <typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Scalar& mask, const Scalar& a, const Scalar& b) {
return numext::is_exactly_zero(mask) ? b : a;
}
} // namespace numext
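With the scalar_select_mask machinery removed (see the next file's diff), numext::select reduces to a ternary on is_exactly_zero. A stand-in with the same shape, hypothetical name, not part of this commit:

#include <cassert>

template <typename Scalar>
Scalar select_sketch(const Scalar& mask, const Scalar& a, const Scalar& b) {
  return mask == Scalar(0) ? b : a;  // numext::is_exactly_zero(mask) in the source
}

int main() {
  assert(select_sketch(0.0, 1.0, 2.0) == 2.0);  // zero mask picks b
  assert(select_sketch(3.0, 1.0, 2.0) == 1.0);  // non-zero mask picks a
}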

View File

@@ -256,48 +256,6 @@ EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z) {
return ComplexT(numext::log(a), b);
}
// For generic scalars, use ternary select.
template <typename Mask>
struct scalar_select_mask<Mask, /*is_built_in_float*/ false> {
static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) { return numext::is_exactly_zero(mask); }
};
// For built-in float mask, bitcast the mask to its integer counterpart and use ternary select.
template <typename Mask>
struct scalar_select_mask<Mask, /*is_built_in_float*/ true> {
using IntegerType = typename numext::get_integer_by_size<sizeof(Mask)>::unsigned_type;
static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) {
return numext::is_exactly_zero(numext::bit_cast<IntegerType>(std::abs(mask)));
}
};
template <int Size = sizeof(long double)>
struct ldbl_select_mask {
static constexpr int MantissaDigits = std::numeric_limits<long double>::digits;
static constexpr int NumBytes = (MantissaDigits == 64 ? 80 : 128) / CHAR_BIT;
static EIGEN_DEVICE_FUNC inline bool run(const long double& mask) {
const uint8_t* mask_bytes = reinterpret_cast<const uint8_t*>(&mask);
for (Index i = 0; i < NumBytes; i++) {
if (mask_bytes[i] != 0) return false;
}
return true;
}
};
template <>
struct ldbl_select_mask<sizeof(double)> : scalar_select_mask<double> {};
template <>
struct scalar_select_mask<long double, true> : ldbl_select_mask<> {};
template <typename RealMask>
struct scalar_select_mask<std::complex<RealMask>, false> {
using impl = scalar_select_mask<RealMask>;
static EIGEN_DEVICE_FUNC inline bool run(const std::complex<RealMask>& mask) {
return impl::run(numext::real(mask)) && impl::run(numext::imag(mask));
}
};
} // end namespace internal
} // end namespace Eigen

View File

@@ -851,7 +851,7 @@ struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
template <typename Dest>
static EIGEN_DEVICE_FUNC void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::IsVectorAtCompileTime>::run(
selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::ColsAtCompileTime == 1>::run(
dst, lhs.nestedExpression(), rhs, alpha);
}
};
@@ -863,7 +863,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
template <typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
selfadjoint_product_impl<Lhs, 0, Lhs::IsVectorAtCompileTime, typename Rhs::MatrixType, Rhs::Mode, false>::run(
selfadjoint_product_impl<Lhs, 0, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, Rhs::Mode, false>::run(
dst, lhs, rhs.nestedExpression(), alpha);
}
};
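A plausible reading of the two replacements above: IsVectorAtCompileTime is true for 1xN and Nx1 expressions alike, while the selfadjoint kernel needs to know specifically whether the dense operand is a single column (respectively a single row). The distinction, checked at compile time assuming Eigen:

#include <Eigen/Core>

int main() {
  static_assert(Eigen::RowVector3f::IsVectorAtCompileTime, "1x3 is a vector");
  static_assert(Eigen::RowVector3f::ColsAtCompileTime == 3, "...but not a single column");
  static_assert(Eigen::Vector3f::ColsAtCompileTime == 1, "3x1 is a single column");
}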

View File

@@ -78,6 +78,14 @@ class SolverBase : public EigenBase<Derived> {
template <typename Derived_>
friend struct internal::solve_assertion;
ComputationInfo info() const {
// CRTP static dispatch: Calls the 'info()' method on the derived class.
// Derived must implement 'ComputationInfo info() const'.
// If not implemented, name lookup falls back to this base method, causing
// infinite recursion (detectable by -Winfinite-recursion).
return derived().info();
}
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
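A minimal self-contained mirror of the CRTP dispatch described in the new comment, hypothetical types, not part of this commit: the base casts itself to Derived and calls info(); a Derived that forgets to define info() resolves the call back to this base method and recurses forever.

enum Info { Success };

template <typename Derived>
struct SolverBaseSketch {
  Info info() const { return static_cast<const Derived&>(*this).info(); }  // CRTP hop
};

struct GoodSolver : SolverBaseSketch<GoodSolver> {
  Info info() const { return Success; }  // hides the base fallback, ending the hop
};

int main() { return GoodSolver().info() == Success ? 0 : 1; }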

View File

@@ -603,10 +603,9 @@ class VectorwiseOp {
/** Returns the expression where each subvector is the product of the vector \a other
* by the corresponding subvector of \c *this */
template <typename OtherDerived>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_product_op<Scalar>, const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type> EIGEN_DEVICE_FUNC
operator*(const DenseBase<OtherDerived>& other) const {
EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_product_op<Scalar, typename OtherDerived::Scalar>,
const ExpressionTypeNestedCleaned, const typename ExtendedType<OtherDerived>::Type>
operator*(const DenseBase<OtherDerived>& other) const {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
@@ -616,8 +615,8 @@ class VectorwiseOp {
/** Returns the expression where each subvector is the quotient of the corresponding
* subvector of \c *this by the vector \a other */
template <typename OtherDerived>
EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type>
EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>,
const ExpressionTypeNestedCleaned, const typename ExtendedType<OtherDerived>::Type>
operator/(const DenseBase<OtherDerived>& other) const {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)

View File

@@ -384,173 +384,6 @@ EIGEN_DEVICE_FUNC void DenseBase<Derived>::visit(Visitor& visitor) const {
namespace internal {
/** \internal
* \brief Base class to implement min and max visitors
*/
template <typename Derived>
struct coeff_visitor {
// default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
EIGEN_DEVICE_FUNC coeff_visitor() : row(-1), col(-1), res(0) {}
typedef typename Derived::Scalar Scalar;
Index row, col;
Scalar res;
EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index i, Index j) {
res = value;
row = i;
col = j;
}
};
template <typename Scalar, int NaNPropagation, bool is_min = true>
struct minmax_compare {
typedef typename packet_traits<Scalar>::type Packet;
static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a < b; }
static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_min<NaNPropagation>(p); }
};
template <typename Scalar, int NaNPropagation>
struct minmax_compare<Scalar, NaNPropagation, false> {
typedef typename packet_traits<Scalar>::type Packet;
static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a > b; }
static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_max<NaNPropagation>(p); }
};
// Default implementation used by non-floating types, where we do not
// need special logic for NaN handling.
template <typename Derived, bool is_min, int NaNPropagation,
bool isInt = NumTraits<typename Derived::Scalar>::IsInteger>
struct minmax_coeff_visitor : coeff_visitor<Derived> {
using Scalar = typename Derived::Scalar;
using Packet = typename packet_traits<Scalar>::type;
using Comparator = minmax_compare<Scalar, NaNPropagation, is_min>;
static constexpr Index PacketSize = packet_traits<Scalar>::size;
EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
if (Comparator::compare(value, this->res)) {
this->res = value;
this->row = i;
this->col = j;
}
}
EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
Scalar value = Comparator::predux(p);
if (Comparator::compare(value, this->res)) {
const Packet range = preverse(plset<Packet>(Scalar(1)));
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
Scalar value = Comparator::predux(p);
const Packet range = preverse(plset<Packet>(Scalar(1)));
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
};
// Suppress NaN. The only case in which we return NaN is if the matrix is all NaN,
// in which case, row=0, col=0 is returned for the location.
template <typename Derived, bool is_min>
struct minmax_coeff_visitor<Derived, is_min, PropagateNumbers, false> : coeff_visitor<Derived> {
typedef typename Derived::Scalar Scalar;
using Packet = typename packet_traits<Scalar>::type;
using Comparator = minmax_compare<Scalar, PropagateNumbers, is_min>;
EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
this->res = value;
this->row = i;
this->col = j;
}
}
EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
const Packet range = preverse(plset<Packet>(Scalar(1)));
/* mask will be zero for NaNs, so they will be ignored. */
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
if ((numext::isnan)(value)) {
this->res = value;
this->row = 0;
this->col = 0;
return;
}
const Packet range = preverse(plset<Packet>(Scalar(1)));
/* mask will be zero for NaNs, so they will be ignored. */
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
};
// Propagate NaNs. If the matrix contains NaN, the location of the first NaN
// will be returned in row and col.
template <typename Derived, bool is_min, int NaNPropagation>
struct minmax_coeff_visitor<Derived, is_min, NaNPropagation, false> : coeff_visitor<Derived> {
typedef typename Derived::Scalar Scalar;
using Packet = typename packet_traits<Scalar>::type;
using Comparator = minmax_compare<Scalar, PropagateNaN, is_min>;
EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
const bool value_is_nan = (numext::isnan)(value);
if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
this->res = value;
this->row = i;
this->col = j;
}
}
EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
const bool value_is_nan = (numext::isnan)(value);
if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
const Packet range = preverse(plset<Packet>(Scalar(1)));
// If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value.
Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
const bool value_is_nan = (numext::isnan)(value);
const Packet range = preverse(plset<Packet>(Scalar(1)));
// If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value.
Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
};
template <typename Derived, bool is_min, int NaNPropagation>
struct functor_traits<minmax_coeff_visitor<Derived, is_min, NaNPropagation>> {
using Scalar = typename Derived::Scalar;
enum { Cost = NumTraits<Scalar>::AddCost, LinearAccess = false, PacketAccess = packet_traits<Scalar>::HasCmp };
};
template <typename Scalar>
struct all_visitor {
using result_type = bool;
@@ -643,100 +476,6 @@ struct all_finite_impl<Derived, false> {
} // end namespace internal
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowId,
IndexType* colId) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
internal::minmax_coeff_visitor<Derived, true, NaNPropagation> minVisitor;
this->visit(minVisitor);
*rowId = minVisitor.row;
if (colId) *colId = minVisitor.col;
return minVisitor.res;
}
/** \returns the minimum of all coefficients of *this and puts in *index its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* index) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::minmax_coeff_visitor<Derived, true, NaNPropagation> minVisitor;
this->visit(minVisitor);
*index = IndexType((RowsAtCompileTime == 1) ? minVisitor.col : minVisitor.row);
return minVisitor.res;
}
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
IndexType* colPtr) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
internal::minmax_coeff_visitor<Derived, false, NaNPropagation> maxVisitor;
this->visit(maxVisitor);
*rowPtr = maxVisitor.row;
if (colPtr) *colPtr = maxVisitor.col;
return maxVisitor.res;
}
/** \returns the maximum of all coefficients of *this and puts in *index its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* index) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::minmax_coeff_visitor<Derived, false, NaNPropagation> maxVisitor;
this->visit(maxVisitor);
*index = (RowsAtCompileTime == 1) ? maxVisitor.col : maxVisitor.row;
return maxVisitor.res;
}
/** \returns true if all coefficients are true
*
* Example: \include MatrixBase_all.cpp

View File

@@ -654,25 +654,6 @@ template <>
EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet4l>(const Packet4l& a) {
__m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
return _mm_extract_epi64_0(r) + _mm_extract_epi64_1(r);
}
template <>
EIGEN_STRONG_INLINE uint64_t predux<Packet4ul>(const Packet4ul& a) {
__m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
return numext::bit_cast<uint64_t>(_mm_extract_epi64_0(r) + _mm_extract_epi64_1(r));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0;
}
#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
@@ -1955,23 +1936,6 @@ EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Pack
return pmul(a, c); // a * 2^e
}
template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) {
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
}
template <>
EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a) {
return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
template <>
EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
@@ -1985,82 +1949,6 @@ EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a)
return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) {
Packet8f tmp;
tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) {
Packet4d tmp;
tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) {
Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) {
Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
// not needed yet
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
// {
// return _mm256_movemask_ps(x)==0xFF;
// }
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) {
return _mm256_movemask_ps(x) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4d& x) {
return _mm256_movemask_pd(x) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) {
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) {
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8h& x) {
return _mm_movemask_epi8(x) != 0;
}
#endif // EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& x) {
return _mm_movemask_epi8(x) != 0;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
@@ -2361,24 +2249,64 @@ EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
return float2half(ptrunc<Packet8f>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8h pisinf<Packet8h>(const Packet8h& a) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
return _mm_cmpeq_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
}
template <>
EIGEN_STRONG_INLINE Packet8h pisnan<Packet8h>(const Packet8h& a) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
return _mm_cmpgt_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
}
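Both predicates lean on the IEEE half layout (1 sign, 5 exponent, 10 mantissa bits): +inf is 0x7C00, so any absolute value strictly above it must have the maximal exponent with a non-zero mantissa, i.e. be a NaN. A standalone check of that bound, not part of this commit:

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;  // 0x7C00
  constexpr uint16_t kAbsMask = (1 << 15) - 1;     // 0x7FFF
  uint16_t quiet_nan = 0x7E00, negative_nan = 0xFE00, one = 0x3C00;
  assert((quiet_nan & kAbsMask) > kInf);
  assert((negative_nan & kAbsMask) > kInf);  // sign bit is masked away
  assert((one & kAbsMask) <= kInf);
}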
// convert the sign-magnitude representation to two's complement
EIGEN_STRONG_INLINE __m128i pmaptosigned(const __m128i& a) {
constexpr uint16_t kAbsMask = (1 << 15) - 1;
// if 'a' has the sign bit set, clear the sign bit and negate the result as if it were an integer
return _mm_sign_epi16(_mm_and_si128(a, _mm_set1_epi16(kAbsMask)), a);
}
// returns a mask that is true where neither `a` nor `b` is NaN
EIGEN_STRONG_INLINE Packet8h pisordered(const Packet8h& a, const Packet8h& b) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
__m128i abs_a = _mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask));
__m128i abs_b = _mm_and_si128(b.m_val, _mm_set1_epi16(kAbsMask));
// check if both `abs_a <= kInf` and `abs_b <= kInf` by checking if max(abs_a, abs_b) <= kInf
// SSE has no `less than or equal` comparison for integers, but comparing against kInf + 1 accomplishes the same goal
return _mm_cmplt_epi16(_mm_max_epu16(abs_a, abs_b), _mm_set1_epi16(kInf + 1));
}
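// Added commentary: kInf is 0x7C00 (the absolute bits of +/-infinity) and every NaN has absolute
// bits strictly greater than 0x7C00, so max(abs_a, abs_b) < 0x7C01 holds exactly when neither
// input is NaN; infinities remain ordered.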
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
__m128i isOrdered = pisordered(a, b);
__m128i isEqual = _mm_cmpeq_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_and_si128(isOrdered, isEqual);
}
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_le(half2float(a), half2float(b)));
__m128i isOrdered = pisordered(a, b);
__m128i isGreater = _mm_cmpgt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_andnot_si128(isGreater, isOrdered);
}
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
__m128i isOrdered = pisordered(a, b);
__m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_and_si128(isOrdered, isLess);
}
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
__m128i isUnordered = por(pisnan(a), pisnan(b));
__m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_or_si128(isUnordered, isLess);
}
template <>
@ -2473,34 +2401,6 @@ EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const
to[stride * 7] = aux[7];
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_max<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_min<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_mul<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
@ -2859,26 +2759,6 @@ EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packe
to[stride * 7] = aux[7];
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

View File

@ -0,0 +1,353 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REDUCTIONS_AVX_H
#define EIGEN_REDUCTIONS_AVX_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
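// Added commentary: every 256-bit reduction in this file uses the same halving strategy: split the
// register into its two 128-bit lanes, combine the lanes with the reduction's own operation, and
// delegate to the matching SSE reduction, e.g. predux(a) == predux(padd(lo, hi)).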
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8i -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int predux(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE int predux_mul(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE int predux_min(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE int predux_max(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_movemask_epi8(a) != 0x0;
#else
return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
#endif
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8ui -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE uint32_t predux(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_min(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_max(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_movemask_epi8(a) != 0x0;
#else
return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
#endif
}
#ifdef EIGEN_VECTORIZE_AVX2
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4l -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) {
Packet2l lo = _mm256_castsi256_si128(a);
Packet2l hi = _mm256_extractf128_si256(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ul -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE uint64_t predux(const Packet4ul& a) {
return static_cast<uint64_t>(predux(Packet4l(a)));
}
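// Added commentary: the unsigned sum above can reuse the signed Packet4l reduction because
// two's-complement addition is bitwise identical for signed and unsigned lanes (mod 2^64).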
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
}
#endif
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8f -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE float predux(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
return _mm256_movemask_ps(a) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4d -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE double predux(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4d& a) {
return _mm256_movemask_pd(a) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8h -- -- -- -- -- -- -- -- -- -- -- -- */
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE half predux(const Packet8h& a) {
return static_cast<half>(predux(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_mul(const Packet8h& a) {
return static_cast<half>(predux_mul(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_min(const Packet8h& a) {
return static_cast<half>(predux_min(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet8h& a) {
return static_cast<half>(predux_min<PropagateNumbers>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet8h& a) {
return static_cast<half>(predux_min<PropagateNaN>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_max(const Packet8h& a) {
return static_cast<half>(predux_max(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet8h& a) {
return static_cast<half>(predux_max<PropagateNumbers>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet8h& a) {
return static_cast<half>(predux_max<PropagateNaN>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8h& a) {
return _mm_movemask_epi8(a) != 0;
}
#endif // EIGEN_VECTORIZE_AVX512FP16
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8bf -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) {
return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) {
return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& a) {
return _mm_movemask_epi8(a) != 0;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REDUCTIONS_AVX_H

View File

@ -1494,40 +1494,6 @@ EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d&
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
#endif
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f x = _mm256_add_ps(lane0, lane1);
return predux<Packet8f>(x);
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
return predux<Packet4f>(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d sum = _mm256_add_pd(lane0, lane1);
return predux<Packet4d>(sum);
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet8l>(const Packet8l& a) {
return _mm512_reduce_add_epi64(a);
}
template <>
EIGEN_STRONG_INLINE int predux<Packet16i>(const Packet16i& a) {
return _mm512_reduce_add_epi32(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
@ -1574,136 +1540,6 @@ EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
return _mm256_add_epi64(lane0, lane1);
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
// #ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE int predux_mul<Packet16i>(const Packet16i& a) {
return _mm512_reduce_mul_epi32(a);
}
#if EIGEN_COMP_MSVC
// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative.
// Fall back to a manual approach:
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
Packet4l res = pmul(lane0, lane1);
res = pmul(res, Packet4l(_mm256_permute2x128_si256(res, res, 1)));
res = pmul(res, Packet4l(_mm256_shuffle_epi32(res, 0xE)));
return pfirst(res);
}
#else
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
return _mm512_reduce_mul_epi64(a);
}
#endif
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE int predux_min<Packet16i>(const Packet16i& a) {
return _mm512_reduce_min_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int64_t predux_min<Packet8l>(const Packet8l& a) {
return _mm512_reduce_min_epi64(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE int predux_max<Packet16i>(const Packet16i& a) {
return _mm512_reduce_max_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int64_t predux_max<Packet8l>(const Packet8l& a) {
return _mm512_reduce_max_epi64(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) {
return _mm512_reduce_or_epi32(a) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) {
return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
return _mm512_reduce_or_epi64(a) != 0;
}
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
@ -2466,12 +2302,6 @@ EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet
return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
}
template <>
EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
}
template <>
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
Packet8h lane0 = _mm256_extractf128_si256(a, 0);
@ -2479,26 +2309,6 @@ EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
return padd<Packet8h>(lane0, lane1);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
Packet16f af = half2float(a);
float reduced = predux_max<Packet16f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
Packet16f af = half2float(a);
float reduced = predux_min<Packet16f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux_mul(from_float));
}
template <>
EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
@ -3005,26 +2815,6 @@ EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a
return padd<Packet8bf>(lane0, lane1);
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
__m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,

View File

@ -0,0 +1,297 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REDUCTIONS_AVX512_H
#define EIGEN_REDUCTIONS_AVX512_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16i -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int predux(const Packet16i& a) {
return _mm512_reduce_add_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int predux_mul(const Packet16i& a) {
return _mm512_reduce_mul_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int predux_min(const Packet16i& a) {
return _mm512_reduce_min_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int predux_max(const Packet16i& a) {
return _mm512_reduce_max_epi32(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) {
return _mm512_reduce_or_epi32(a) != 0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8l -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) {
return _mm512_reduce_add_epi64(a);
}
#if EIGEN_COMP_MSVC
// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative.
// Fall back to a manual approach:
template <>
EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
return predux_mul(pmul(lane0, lane1));
}
#else
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
return _mm512_reduce_mul_epi64(a);
}
#endif
template <>
EIGEN_STRONG_INLINE int64_t predux_min(const Packet8l& a) {
return _mm512_reduce_min_epi64(a);
}
template <>
EIGEN_STRONG_INLINE int64_t predux_max(const Packet8l& a) {
return _mm512_reduce_max_epi64(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
return _mm512_reduce_or_epi64(a) != 0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16f -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE float predux(const Packet16f& a) {
return _mm512_reduce_add_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) {
return _mm512_reduce_mul_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
return _mm512_reduce_min_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
return _mm512_reduce_max_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0;
}
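// Added commentary: AVX512 exposes native _mm512_reduce_{add,mul,min,max}_ps sequences for the plain
// reductions above, but no NaN-aware variants, which is why the PropagateNumbers/PropagateNaN
// specializations split into two 256-bit lanes and reuse the AVX kernels instead.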
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8d -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE double predux(const Packet8d& a) {
return _mm512_reduce_add_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) {
return _mm512_reduce_mul_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) {
return _mm512_reduce_min_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) {
return _mm512_reduce_max_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) {
return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0;
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16h -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE half predux(const Packet16h& from) {
return half(predux(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) {
return half(predux_mul(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) {
return half(predux_min(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet16h& from) {
return half(predux_min<PropagateNumbers>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet16h& from) {
return half(predux_min<PropagateNaN>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) {
return half(predux_max(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet16h& from) {
return half(predux_max<PropagateNumbers>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet16h& from) {
return half(predux_max<PropagateNaN>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16h& a) {
return predux_any<Packet8i>(a.m_val);
}
#endif
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16bf -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& from) {
return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) {
return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16bf& a) {
return predux_any<Packet8i>(a.m_val);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REDUCTIONS_AVX512_H

View File

@ -129,30 +129,20 @@ EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a)
}
#ifdef EIGEN_VECTORIZE_VSX
// VSX support varies between different compilers and even different
// versions of the same compiler. For gcc version >= 4.9.3, we can use
// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
// a slow version that works with older compilers.
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
template <>
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
#if EIGEN_GNUC_STRICT_AT_LEAST(7, 1, 0)
return vec_cts(x, 0); // TODO: check clang version.
#else
double tmp[2];
memcpy(tmp, &x, sizeof(tmp));
Packet2l l = {static_cast<long long>(tmp[0]), static_cast<long long>(tmp[1])};
return l;
#endif
EIGEN_ALIGN_MAX double dtmp[2];
pstore(dtmp, x);
EIGEN_ALIGN_MAX long long itmp[2] = {static_cast<long long>(dtmp[0]), static_cast<long long>(dtmp[1])};
return vec_xl(0, itmp);
}
template <>
inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
unsigned long long tmp[2];
memcpy(tmp, &x, sizeof(tmp));
Packet2d d = {static_cast<double>(tmp[0]), static_cast<double>(tmp[1])};
return d;
EIGEN_ALIGN_MAX long long itmp[2];
vec_xst(x, 0, itmp);
EIGEN_ALIGN_MAX double dtmp[2] = {static_cast<double>(itmp[0]), static_cast<double>(itmp[1])};
return pload<Packet2d>(dtmp);
}
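// Added commentary: routing the conversion through aligned scalar buffers (pstore/pload and
// vec_xl/vec_xst) sidesteps the buggy 64-bit vec_cts/vec_ctf path referenced above, at the cost
// of one store/load round trip per conversion.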
#endif

View File

@ -1689,7 +1689,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const
}
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
using Scalar = typename unpacket_traits<Packet>::type;
@ -1705,7 +1706,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
};
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
@ -1724,7 +1726,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
};
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
@ -1739,7 +1742,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
// \internal \returns the sign of a complex number z, defined as z / abs(z).
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
unpacket_traits<Packet>::vectorizable>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
@ -2176,7 +2180,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, c
// Generic implementation of pow(x,y).
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) {
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
const Packet& x, const Packet& y) {
typedef typename unpacket_traits<Packet>::type Scalar;
const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
@ -2266,6 +2271,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Pac
return pow;
}
template <typename Scalar>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
const Scalar& x, const Scalar& y) {
return numext::pow(x, y);
}
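// Added commentary: the two generic_pow overloads dispatch on is_scalar, so the same call site
// compiles for packets (vectorized implementation above) and for plain scalars, which simply
// forward to numext::pow.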
namespace unary_pow {
template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
@ -2347,35 +2358,36 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const Scal
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x,
const typename unpacket_traits<Packet>::type& exponent) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
const Packet exponent_packet = pset1<Packet>(exponent);
return generic_pow_impl(x, exponent_packet);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
const Scalar& x, const Scalar& exponent) {
return numext::pow(x, exponent);
}
template <typename Packet, typename ScalarExponent>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
const ScalarExponent& exponent) {
using Scalar = typename unpacket_traits<Packet>::type;
// non-integer base and exponent case
const Scalar pos_zero = Scalar(0);
const Scalar all_ones = ptrue<Scalar>(Scalar());
const Scalar pos_one = Scalar(1);
const Scalar pos_inf = NumTraits<Scalar>::infinity();
const Packet cst_pos_zero = pzero(x);
const Packet cst_pos_one = pset1<Packet>(pos_one);
const Packet cst_pos_inf = pset1<Packet>(pos_inf);
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
const Packet cst_true = ptrue<Packet>(x);
const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
const bool exponent_is_neg = exponent < ScalarExponent(0);
const bool exponent_is_pos = exponent > ScalarExponent(0);
const Packet exp_is_not_fin = pset1<Packet>(exponent_is_not_fin ? all_ones : pos_zero);
const Packet exp_is_neg = pset1<Packet>(exponent_is_neg ? all_ones : pos_zero);
const Packet exp_is_pos = pset1<Packet>(exponent_is_pos ? all_ones : pos_zero);
const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
@ -2411,22 +2423,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Pack
// This routine handles negative exponents.
// The return value is either 0, 1, or -1.
const Scalar pos_zero = Scalar(0);
const Scalar all_ones = ptrue<Scalar>(Scalar());
const Scalar pos_one = Scalar(1);
const Packet cst_pos_one = pset1<Packet>(pos_one);
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
const Packet exp_is_odd = pset1<Packet>(exponent_is_odd ? all_ones : pos_zero);
const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
const Packet abs_x = pabs(x);
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
Packet result = pselect(exp_is_odd, x, abs_x);
result = pand(abs_x_is_one, result);
result = pselect(abs_x_is_one, result, pzero<Packet>(x));
return result;
}
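// Added commentary, tracing the mask logic above: when |x| == 1 the result is x for an odd exponent
// (e.g. pow(-1, -3) == -1) and |x| for an even one; for any |x| != 1 the abs_x_is_one mask is
// false and the result collapses to 0, with other magnitudes resolved by the surrounding pow logic.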

View File

@ -497,16 +497,56 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
a = half(float(a) / float(b));
return a;
}
// Non-negative floating point numbers have a monotonic mapping to non-negative integers.
// This property allows floating point numbers to be reinterpreted as integers for comparisons, which is useful if
// there is no native floating point comparison operator. Floating point signedness uses a sign-magnitude
// representation, whereas integers typically use two's complement. Converting the bit pattern from sign-magnitude to
// two's complement allows the transformed bit patterns to be compared as signed integers. All edge cases (+/-0 and
// +/- infinity) are handled automatically, except NaN.
//
// fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. The bit pattern conveys NaN when all five exponent
// bits are set and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
// NaN, clear the sign bit and check if the integral representation is greater than 0111110000000000 (0x7C00). To
// test for non-NaN, clear the sign bit and check if the integral representation is less than or equal to
// 0111110000000000.
// convert sign-magnitude representation to two's complement
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
constexpr uint16_t kAbsMask = (1 << 15) - 1;
// If the sign bit is set, clear the sign bit and return the (integer) negation. Otherwise, return the input.
return (a >> 15) ? -(a & kAbsMask) : a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool isOrdered(const half& a, const half& b) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
return numext::maxi(a.x & kAbsMask, b.x & kAbsMask) <= kInf;
}
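// Worked example (added commentary, not part of the original change):
//   half a(1.0f);   // a.x == 0x3C00, mapToSigned -> +15360
//   half b(-2.0f);  // b.x == 0xC000, mapToSigned -> -16384
//   bool lt = (b < a);  // true: -16384 < +15360 and both operands are ordered
// A NaN payload such as 0x7E00 fails isOrdered(), so every ordered comparison below involving it
// returns false, while operator!= (the negation of ==) returns true, matching IEEE semantics.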
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
return numext::equal_strict(float(a), float(b));
bool result = mapToSigned(a.x) == mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) {
return numext::not_equal_strict(float(a), float(b));
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !(a == b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) {
bool result = mapToSigned(a.x) < mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) {
bool result = mapToSigned(a.x) <= mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) {
bool result = mapToSigned(a.x) > mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) {
bool result = mapToSigned(a.x) >= mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return float(a) < float(b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return float(a) <= float(b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return float(a) > float(b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return float(a) >= float(b); }
#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
#pragma pop_macro("EIGEN_DEVICE_FUNC")
@ -706,7 +746,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) < 0x7c00;
#else
return (a.x & 0x7fff) < 0x7c00;
#endif
}
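// Added commentary: 0x7C00 is the bit pattern of +infinity in fp16, so clearing the sign bit and
// requiring the result to be strictly below 0x7C00 classifies every finite value, including
// subnormals and +/-0, in a single integer comparison.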
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {

View File

@ -31,6 +31,15 @@ namespace internal {
#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
#endif
// We need to distinguish clang as the CUDA compiler from clang as the host compiler,
// invoked by NVCC (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
#else
#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
#endif
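// Added commentary: this macro gates the __int_as_float-style device helpers defined further down
// in this file and feeds packet_traits entries such as HasCmp below.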
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
@ -74,7 +83,10 @@ struct packet_traits<float> : default_packet_traits {
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
HasBlend = 0
HasBlend = 0,
HasFloor = 1,
HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
};
};
@ -143,10 +155,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from)
return make_double2(from, from);
}
// We need to distinguish clang as the CUDA compiler from clang as the host compiler,
// invoked by NVCC (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
return __int_as_float(__float_as_int(a) & __float_as_int(b));
@ -259,8 +268,7 @@ template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
}
#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG &&
// !EIGEN_COMP_NVCC)
#endif // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {

View File

@ -1287,6 +1287,14 @@ template <>
EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vfma_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return vfmsq_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vfms_f32(c, a, b);
}
#else
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
@ -1296,7 +1304,31 @@ template <>
EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vmla_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return vmlsq_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vmls_f32(c, a, b);
}
#endif
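// Added commentary: the remaining FMA variants follow from sign identities,
//   pmsub(a, b, c)  = a*b - c  = -pnmadd(a, b, c)
//   pnmsub(a, b, c) = -a*b - c = -pmadd(a, b, c)
// so both reuse the fused kernels above (pnegate is exact, so each lane still rounds once).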
template <>
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return pnegate(pmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return pnegate(pmadd(a, b, c));
}
// No FMA instruction for int, so use MLA unconditionally.
template <>
@ -5242,13 +5274,28 @@ template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vfmaq_f64(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vfmsq_f64(c, a, b);
}
#else
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vmlaq_f64(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vmlsq_f64(c, a, b);
}
#endif
template <>
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return pnegate(pmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
return vminq_f64(a, b);
@ -5657,18 +5704,33 @@ EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, cons
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return vfmaq_f16(pnegate(c), a, b);
EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return vfmsq_f16(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
return vfma_f16(c, pnegate(a), b);
return vfms_f16(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return pnegate(pmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
return vfma_f16(pnegate(c), pnegate(a), b);
return pnegate(pmadd(a, b, c));
}
template <>

View File

@ -1857,220 +1857,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
template <>
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
// Disable SSE3 _mm_hadd_ps that is extremely slow on all existing Intel architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
// #else
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
// #endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
// Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
// #else
return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a, a)));
// #endif
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
return pfirst<Packet2l>(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a)));
}
#ifdef EIGEN_VECTORIZE_SSSE3
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
Packet4i tmp0 = _mm_hadd_epi32(a, a);
return pfirst<Packet4i>(_mm_hadd_epi32(tmp0, tmp0));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp0 = _mm_hadd_epi32(a, a);
return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
}
#else
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
}
#endif
template <>
EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
}
// Other reduction functions:
// mul
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
}
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
EIGEN_ALIGN16 int64_t aux[2];
pstore(aux, a);
return aux[0] * aux[1];
}
template <>
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
template <>
EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
}
// min
template <>
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a, a)));
}
template <>
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
int aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
int aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
return aux0 < aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4ui>(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
return aux0 < aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
// max
template <>
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a, a)));
}
template <>
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4i>(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::max after the pstore!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
int aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
int aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
return aux0 > aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4ui>(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::max after the pstore!)
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
return aux0 > aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
// not needed yet
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
// {
// return _mm_movemask_ps(x) == 0xF;
// }
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) {
return _mm_movemask_pd(x) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
return _mm_movemask_ps(x) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) {
return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

View File

@ -0,0 +1,324 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REDUCTIONS_SSE_H
#define EIGEN_REDUCTIONS_SSE_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
template <typename Packet>
struct sse_add_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return padd<Packet>(a, b); }
};
template <typename Packet>
struct sse_mul_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmul<Packet>(a, b); }
};
template <typename Packet>
struct sse_min_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmin<Packet>(a, b); }
};
template <int NaNPropagation, typename Packet>
struct sse_min_prop_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
return pmin<NaNPropagation, Packet>(a, b);
}
};
template <typename Packet>
struct sse_max_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmax<Packet>(a, b); }
};
template <int NaNPropagation, typename Packet>
struct sse_max_prop_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
return pmax<NaNPropagation, Packet>(a, b);
}
};
template <typename Packet, typename Op>
struct sse_predux_common;
template <typename Packet>
struct sse_predux_impl : sse_predux_common<Packet, sse_add_wrapper<Packet>> {};
template <typename Packet>
struct sse_predux_mul_impl : sse_predux_common<Packet, sse_mul_wrapper<Packet>> {};
template <typename Packet>
struct sse_predux_min_impl : sse_predux_common<Packet, sse_min_wrapper<Packet>> {};
template <int NaNPropagation, typename Packet>
struct sse_predux_min_prop_impl : sse_predux_common<Packet, sse_min_prop_wrapper<NaNPropagation, Packet>> {};
template <typename Packet>
struct sse_predux_max_impl : sse_predux_common<Packet, sse_max_wrapper<Packet>> {};
template <int NaNPropagation, typename Packet>
struct sse_predux_max_prop_impl : sse_predux_common<Packet, sse_max_prop_wrapper<NaNPropagation, Packet>> {};
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16b -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE bool predux(const Packet16b& a) {
Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
}
template <>
EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) {
Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
}
template <>
EIGEN_STRONG_INLINE bool predux_min(const Packet16b& a) {
return predux_mul(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_max(const Packet16b& a) {
return predux(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16b& a) {
return predux(a);
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4i -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet4i, Op> {
static EIGEN_STRONG_INLINE int run(const Packet4i& a) {
Packet4i tmp;
tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp));
return _mm_cvtsi128_si32(tmp);
}
};
template <>
EIGEN_STRONG_INLINE int predux(const Packet4i& a) {
return sse_predux_impl<Packet4i>::run(a);
}
template <>
EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) {
return sse_predux_mul_impl<Packet4i>::run(a);
}
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) {
return sse_predux_min_impl<Packet4i>::run(a);
}
template <>
EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) {
return sse_predux_max_impl<Packet4i>::run(a);
}
#endif
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& a) {
return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ui -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet4ui, Op> {
static EIGEN_STRONG_INLINE uint32_t run(const Packet4ui& a) {
Packet4ui tmp;
tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp));
return static_cast<uint32_t>(_mm_cvtsi128_si32(tmp));
}
};
template <>
EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) {
return sse_predux_impl<Packet4ui>::run(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) {
return sse_predux_mul_impl<Packet4ui>::run(a);
}
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) {
return sse_predux_min_impl<Packet4ui>::run(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) {
return sse_predux_max_impl<Packet4ui>::run(a);
}
#endif
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& a) {
return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2l -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet2l, Op> {
static EIGEN_STRONG_INLINE int64_t run(const Packet2l& a) {
Packet2l tmp;
tmp = Op::packetOp(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp);
}
};
template <>
EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) {
return sse_predux_impl<Packet2l>::run(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2l& a) {
return _mm_movemask_pd(_mm_castsi128_pd(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4f -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet4f, Op> {
static EIGEN_STRONG_INLINE float run(const Packet4f& a) {
Packet4f tmp;
tmp = Op::packetOp(a, _mm_movehl_ps(a, a));
#ifdef EIGEN_VECTORIZE_SSE3
tmp = Op::packetOp(tmp, _mm_movehdup_ps(tmp));
#else
tmp = Op::packetOp(tmp, _mm_shuffle_ps(tmp, tmp, 1));
#endif
return _mm_cvtss_f32(tmp);
}
};
template <>
EIGEN_STRONG_INLINE float predux(const Packet4f& a) {
return sse_predux_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) {
return sse_predux_mul_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) {
return sse_predux_min_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet4f& a) {
return sse_predux_min_prop_impl<PropagateNumbers, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet4f& a) {
return sse_predux_min_prop_impl<PropagateNaN, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) {
return sse_predux_max_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet4f& a) {
return sse_predux_max_prop_impl<PropagateNumbers, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet4f& a) {
return sse_predux_max_prop_impl<PropagateNaN, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& a) {
return _mm_movemask_ps(a) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2d -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet2d, Op> {
static EIGEN_STRONG_INLINE double run(const Packet2d& a) {
Packet2d tmp;
tmp = Op::packetOp(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(tmp);
}
};
template <>
EIGEN_STRONG_INLINE double predux(const Packet2d& a) {
return sse_predux_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) {
return sse_predux_mul_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) {
return sse_predux_min_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet2d& a) {
return sse_predux_min_prop_impl<PropagateNumbers, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet2d& a) {
return sse_predux_min_prop_impl<PropagateNaN, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) {
return sse_predux_max_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet2d& a) {
return sse_predux_max_prop_impl<PropagateNumbers, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet2d& a) {
return sse_predux_max_prop_impl<PropagateNaN, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2d& a) {
return _mm_movemask_pd(a) != 0x0;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REDUCTIONS_SSE_H
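A hedged sketch of the NaN-propagation variants defined above (needs <limits>; assumes SSE and the Eigen::internal namespace):
// PropagateNumbers reduces over the numeric lanes and skips NaNs;
// PropagateNaN returns NaN as soon as any lane is NaN.
EIGEN_ALIGN16 float d[4] = {2.f, std::numeric_limits<float>::quiet_NaN(), 1.f, 4.f};
Packet4f p = pload<Packet4f>(d);
float lo = predux_min<PropagateNumbers>(p);  // 1.f
float nn = predux_min<PropagateNaN>(p);      // quiet NaN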

View File

@ -55,7 +55,7 @@ namespace internal {
ConjugateRhs, ColMajor, 1> { \
typedef gebp_traits<EIGTYPE, EIGTYPE> Traits; \
\
static void run(Index rows, Index cols, Index depth, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs, \
static void run(Index rows, Index cols, Index depth, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_, \
Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha, \
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, GemmParallelInfo<Index>* /*info = 0*/) { \
using std::conj; \
@ -84,20 +84,20 @@ namespace internal {
\
/* Set a, b, c */ \
if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, m, k, OuterStride<>(lhsStride)); \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride)); \
a_tmp = lhs.conjugate(); \
a = a_tmp.data(); \
lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else \
a = _lhs; \
a = lhs_; \
\
if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, k, n, OuterStride<>(rhsStride)); \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride)); \
b_tmp = rhs.conjugate(); \
b = b_tmp.data(); \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else \
b = _rhs; \
b = rhs_; \
\
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, \
(const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
@ -116,6 +116,88 @@ GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif
// If OpenBLAS with BUILD_BFLOAT16=1 support is available,
// use sbgemm for bfloat16.
#if EIGEN_USE_OPENBLAS_BFLOAT16
extern "C" {
// OpenBLAS prototype.
void sbgemm_(const char* trans_a, const char* trans_b, const int* M, const int* N, const int* K, const float* alpha,
const Eigen::bfloat16* A, const int* lda, const Eigen::bfloat16* B, const int* ldb, const float* beta,
float* C, const int* ldc);
} // extern "C"
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index, Eigen::bfloat16, LhsStorageOrder, ConjugateLhs, Eigen::bfloat16,
RhsStorageOrder, ConjugateRhs, ColMajor, 1> {
typedef gebp_traits<Eigen::bfloat16, Eigen::bfloat16> Traits;
static void run(Index rows, Index cols, Index depth, const Eigen::bfloat16* lhs_, Index lhsStride,
const Eigen::bfloat16* rhs_, Index rhsStride, Eigen::bfloat16* res, Index resIncr, Index resStride,
Eigen::bfloat16 alpha, level3_blocking<Eigen::bfloat16, Eigen::bfloat16>& /*blocking*/,
GemmParallelInfo<Index>* /*info = 0*/) {
using std::conj;
if (rows == 0 || cols == 0 || depth == 0) return;
EIGEN_ONLY_USED_FOR_DEBUG(resIncr);
eigen_assert(resIncr == 1);
char transa, transb;
BlasIndex m, n, k, lda, ldb, ldc;
const Eigen::bfloat16 *a, *b;
float falpha = static_cast<float>(alpha);
float fbeta = float(1.0);
using MatrixXbf16 = Matrix<Eigen::bfloat16, Dynamic, Dynamic>;
MatrixXbf16 a_tmp, b_tmp;
MatrixXf r_tmp;
/* Set transpose options */
transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';
transb = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';
/* Set m, n, k */
m = convert_index<BlasIndex>(rows);
n = convert_index<BlasIndex>(cols);
k = convert_index<BlasIndex>(depth);
/* Set lda, ldb, ldc */
lda = convert_index<BlasIndex>(lhsStride);
ldb = convert_index<BlasIndex>(rhsStride);
ldc = convert_index<BlasIndex>(m);
/* Set a, b, c */
if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {
Map<const MatrixXbf16, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));
a_tmp = lhs.conjugate();
a = a_tmp.data();
lda = convert_index<BlasIndex>(a_tmp.outerStride());
} else {
a = lhs_;
}
if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {
Map<const MatrixXbf16, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));
b_tmp = rhs.conjugate();
b = b_tmp.data();
ldb = convert_index<BlasIndex>(b_tmp.outerStride());
} else {
b = rhs_;
}
// Evaluate into a temporary intermediate array, seeded with the current
// output values so that the fbeta == 1.0 accumulation matches the generic
// BLAS path above.
r_tmp = Map<const MatrixXbf16, 0, OuterStride<> >(res, m, n, OuterStride<>(resStride)).cast<float>();
sbgemm_(&transa, &transb, &m, &n, &k, (const float*)&numext::real_ref(falpha), a, &lda, b, &ldb,
(const float*)&numext::real_ref(fbeta), r_tmp.data(), &ldc);
// Cast to the output.
Map<MatrixXbf16, 0, OuterStride<> > result(res, m, n, OuterStride<>(resStride));
result = r_tmp.cast<Eigen::bfloat16>();
}
};
#endif // EIGEN_USE_OPENBLAS_BFLOAT16
} // namespace internal
} // end namespace Eigen
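A hedged usage sketch of this dispatch (assumes OpenBLAS was built with BUILD_BFLOAT16=1 and Eigen is compiled with EIGEN_USE_OPENBLAS_BFLOAT16=1; sizes are illustrative):
#include <Eigen/Dense>
using MatBF16 = Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic>;
MatBF16 A = MatBF16::Random(256, 128), B = MatBF16::Random(128, 64);
// Routed through sbgemm_: bfloat16 inputs, float accumulation, cast back.
MatBF16 C = A * B;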

View File

@ -164,6 +164,11 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
enum { LhsUpLo = LhsMode & (Upper | Lower) };
// Verify that the Rhs is a vector in the correct orientation.
// Otherwise, we break the assumption that we are multiplying
// MxN * Nx1.
static_assert(Rhs::ColsAtCompileTime == 1, "The RHS must be a column vector.");
template <typename Dest>
static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
typedef typename Dest::Scalar ResScalar;

View File

@ -8,7 +8,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#define EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
@ -98,4 +98,4 @@
#endif // gpu_assert
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H

View File

@ -8,7 +8,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#if defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
@ -40,6 +40,6 @@
#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#undef EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H

View File

@ -762,7 +762,7 @@ void swap(scoped_array<T>& a, scoped_array<T>& b) {
* This is accomplished through alloca if the latter is supported and if the required number of bytes
* is below EIGEN_STACK_ALLOCATION_LIMIT.
*/
#ifdef EIGEN_ALLOCA
#if defined(EIGEN_ALLOCA) && !defined(EIGEN_NO_ALLOCA)
#if EIGEN_DEFAULT_ALIGN_BYTES > 0
// We always manually re-align the result of EIGEN_ALLOCA.
@ -785,14 +785,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* pt
#define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
#endif
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = (BUFFER) != 0 ? (BUFFER) \
: reinterpret_cast<TYPE*>((sizeof(TYPE) * SIZE <= EIGEN_STACK_ALLOCATION_LIMIT) \
? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * SIZE) \
: Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
(BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * SIZE > EIGEN_STACK_ALLOCATION_LIMIT)
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = (BUFFER) != 0 ? (BUFFER) \
: reinterpret_cast<TYPE*>((sizeof(TYPE) * (SIZE) <= EIGEN_STACK_ALLOCATION_LIMIT) \
? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * (SIZE)) \
: Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
(BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * (SIZE) > EIGEN_STACK_ALLOCATION_LIMIT)
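A hedged sketch of how the (internal) macro above is used; scratch_fill and its body are illustrative, not from the patch:
// Requests n floats: on the stack via EIGEN_ALIGNED_ALLOCA when
// sizeof(float) * n fits under EIGEN_STACK_ALLOCATION_LIMIT, on the heap
// otherwise; the handler object frees the heap case at scope exit.
// Defining EIGEN_NO_ALLOCA now forces the heap path unconditionally.
void scratch_fill(Eigen::Index n) {
  ei_declare_aligned_stack_constructed_variable(float, buf, n, 0);
  for (Eigen::Index i = 0; i < n; ++i) buf[i] = 0.f;
}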
#define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \
Eigen::internal::local_nested_eval_wrapper<XPR_T, N> EIGEN_CAT(NAME, _wrapper)( \
@ -805,10 +805,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* pt
#else
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = \
(BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
(BUFFER) == 0 ? NAME : 0, SIZE, true)
#define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \

View File

@ -345,7 +345,7 @@ EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorT
// Apply similarity transformation to remaining columns,
// i.e., A = H A H' where H = I - h v v' and v = matA.col(i).tail(n-i-1)
matA.col(i).coeffRef(i + 1) = (RealScalar)1;
matA.col(i).coeffRef(i + 1) = Scalar(1);
hCoeffs.tail(n - i - 1).noalias() =
(matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() *

View File

@ -85,6 +85,29 @@ class QuaternionBase : public RotationBase<Derived, 3> {
return derived().coeffs();
}
/** \returns a vector containing the coefficients, rearranged into the order [\c w, \c x, \c y, \c z].
*
* This is the order expected by the \code Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar&
* z) \endcode constructor, but not the order of the internal vector representation. Therefore, it returns a newly
* constructed vector.
*
* \sa QuaternionBase::coeffsScalarLast()
* */
EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarFirst() const {
return derived().coeffsScalarFirst();
}
/** \returns a vector containing the coefficients in their original order [\c x, \c y, \c z, \c w].
*
* This is equivalent to \code coeffs() \endcode, but returns a newly constructed vector for uniformity with \code
* coeffsScalarFirst() \endcode.
*
* \sa QuaternionBase::coeffsScalarFirst()
* */
EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarLast() const {
return derived().coeffsScalarLast();
}
/** \returns a vector expression of the coefficients (x,y,z,w) */
EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
@ -357,12 +380,23 @@ class Quaternion : public QuaternionBase<Quaternion<Scalar_, Options_> > {
EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarLast(const Scalar& x, const Scalar& y, const Scalar& z,
const Scalar& w);
EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarFirst(const Scalar& w, const Scalar& x, const Scalar& y,
const Scalar& z);
template <typename Derived1, typename Derived2>
EIGEN_DEVICE_FUNC static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
}
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment))
#ifdef EIGEN_QUATERNION_PLUGIN
@ -437,6 +471,12 @@ class Map<const Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<const
EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
}
protected:
const Coefficients m_coeffs;
};
@ -473,6 +513,12 @@ class Map<Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<Quaternion<
EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
}
protected:
Coefficients m_coeffs;
};
@ -694,6 +740,35 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::UnitR
return Quaternion(a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
}
/** Constructs a quaternion from its coefficients in the order [\c x, \c y, \c z, \c w], i.e. vector part [\c x, \c y,
* \c z] first, scalar part \a w LAST.
*
* This factory accepts the parameters in the same order as the underlying coefficient vector. Consider using this
* factory function to make the parameter ordering explicit.
*/
template <typename Scalar, int Options>
EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarLast(const Scalar& x,
const Scalar& y,
const Scalar& z,
const Scalar& w) {
return Quaternion(w, x, y, z);
}
/** Constructs a quaternion from its coefficients in the order [\c w, \c x, \c y, \c z], i.e. scalar part \a w FIRST,
* vector part [\c x, \c y, \c z] last.
*
* This factory accepts the parameters in the same order as the constructor \code Quaternion(const Scalar& w, const
* Scalar& x, const Scalar& y, const Scalar& z) \endcode. Consider using this factory function to make the parameter
* ordering explicit.
*/
template <typename Scalar, int Options>
EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarFirst(const Scalar& w,
const Scalar& x,
const Scalar& y,
const Scalar& z) {
return Quaternion(w, x, y, z);
}
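A short usage sketch contrasting the two factories (identity rotation, illustrative values):
// Same quaternion, two argument orders; storage stays [x, y, z, w] either way.
Eigen::Quaternionf qa = Eigen::Quaternionf::FromCoeffsScalarFirst(1.f, 0.f, 0.f, 0.f);  // w first
Eigen::Quaternionf qb = Eigen::Quaternionf::FromCoeffsScalarLast(0.f, 0.f, 0.f, 1.f);   // w last
// qa.coeffsScalarFirst() is [1, 0, 0, 0], while qa.coeffs() is still [0, 0, 0, 1].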
/** Returns a quaternion representing a rotation between
* the two arbitrary vectors \a a and \a b. In other words, the built
* rotation represent a rotation sending the line of direction \a a

View File

@ -78,6 +78,17 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> PermutationPType;
typedef typename MatrixType::PlainObject PlainObject;
/** \brief Reports whether the LU factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
return Success;
}
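A hedged sketch of why the trivial info() is still useful: it lets FullPivLU slot into generic code written against factorizations that can fail (A and b are assumed inputs):
Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
if (lu.info() == Eigen::Success) {  // always true here, but generic callers check
  Eigen::VectorXd x = lu.solve(b);
}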
/**
* \brief Default Constructor.
*

View File

@ -268,7 +268,7 @@ struct Assignment<DstXprType, Inverse<XprType>,
* \note This matrix must be invertible, otherwise the result is undefined. If you need an
* invertibility check, do the following:
* \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
* \li for the general case, use class FullPivLU.
* \li for the general case, use class PartialPivLU.
*
* Example: \include MatrixBase_inverse.cpp
* Output: \verbinclude MatrixBase_inverse.out
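To make the note concrete, a hedged sketch of the fixed-size checked path (m is an assumed Matrix3d):
Eigen::Matrix3d inv;
double det;
bool invertible;
m.computeInverseAndDetWithCheck(inv, det, invertible);  // fixed sizes up to 4x4
// For dynamic sizes, factor with PartialPivLU and call inverse() on the factorization.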

View File

@ -90,6 +90,17 @@ class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationInde
typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> TranspositionType;
typedef typename MatrixType::PlainObject PlainObject;
/** \brief Reports whether the LU factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
return Success;
}
/**
* \brief Default Constructor.
*

View File

@ -82,6 +82,17 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
typedef typename MatrixType::PlainObject PlainObject;
/** \brief Reports whether the QR factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
return Success;
}
/** \brief Default Constructor.
*
* The default constructor is useful in cases in which the user intends to

View File

@ -75,6 +75,17 @@ class HouseholderQR : public SolverBase<HouseholderQR<MatrixType_>> {
typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>>
HouseholderSequenceType;
/** \brief Reports whether the QR factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
return Success;
}
/**
* \brief Default Constructor.
*

View File

@ -165,7 +165,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
BDCSVD(const MatrixType& matrix) : m_algoswap(16), m_numIters(0) {
template <typename Derived>
BDCSVD(const MatrixBase<Derived>& matrix) : m_algoswap(16), m_numIters(0) {
compute_impl(matrix, internal::get_computation_options(Options));
}
@ -181,7 +182,9 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
* \deprecated Will be removed in the next major Eigen version. Options should
* be specified in the \a Options template parameter.
*/
EIGEN_DEPRECATED BDCSVD(const MatrixType& matrix, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
template <typename Derived>
EIGEN_DEPRECATED BDCSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions)
: m_algoswap(16), m_numIters(0) {
internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
compute_impl(matrix, computationOptions);
}
@ -193,7 +196,10 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
BDCSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); }
template <typename Derived>
BDCSVD& compute(const MatrixBase<Derived>& matrix) {
return compute_impl(matrix, m_computationOptions);
}
/** \brief Method performing the decomposition of given matrix, as specified by
* the `computationOptions` parameter.
@ -204,7 +210,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
* \deprecated Will be removed in the next major Eigen version. Options should
* be specified in the \a Options template parameter.
*/
EIGEN_DEPRECATED BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions) {
template <typename Derived>
EIGEN_DEPRECATED BDCSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
return compute_impl(matrix, computationOptions);
}
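The practical effect of widening these signatures to MatrixBase<Derived> is that expressions can feed the decomposition directly; a hedged sketch:
Eigen::MatrixXf A = Eigen::MatrixXf::Random(100, 40);
// An expression argument now compiles; scalar type and compile-time sizes
// are still enforced by the static asserts in compute_impl.
Eigen::BDCSVD<Eigen::MatrixXf> svd(A.topRows(60) * 2.0f);
Eigen::VectorXf sv = svd.singularValues();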
@ -215,7 +222,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
}
private:
BDCSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
template <typename Derived>
BDCSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals,
@ -307,8 +315,13 @@ void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int
} // end allocate
template <typename MatrixType, int Options>
BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixType& matrix,
template <typename Derived>
BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
unsigned int computationOptions) {
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
Input matrix must have the same Scalar type as the BDCSVD object.);
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "\n\n\n================================================================================================="
"=====================\n\n\n";

View File

@ -58,7 +58,8 @@ class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
// construct this by moving from a parent object
BDCSVD_LAPACKE(SVD&& svd) : SVD(std::move(svd)) {}
void compute_impl_lapacke(const MatrixType& matrix, unsigned int computationOptions) {
template <typename Derived>
void compute_impl_lapacke(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
SVD::allocate(matrix.rows(), matrix.cols(), computationOptions);
SVD::m_nonzeroSingularValues = SVD::m_diagSize;
@ -120,8 +121,8 @@ class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
}
};
template <typename MatrixType_, int Options>
BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixType_& matrix,
template <typename MatrixType_, int Options, typename Derived>
BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixBase<Derived>& matrix,
int computationOptions) {
// we need to move to the wrapper type and back
BDCSVD_LAPACKE<MatrixType_, Options> tmpSvd(std::move(svd));
@ -134,12 +135,13 @@ BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd,
} // end namespace internal
#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS) \
template <> \
inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) { \
return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions); \
#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS) \
template <> \
template <typename Derived> \
inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const MatrixBase<Derived>& matrix, unsigned int computationOptions) { \
return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions); \
}
#define EIGEN_LAPACK_SDD_OPTIONS(OPTIONS) \

View File

@ -565,7 +565,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
explicit JacobiSVD(const MatrixType& matrix) { compute_impl(matrix, internal::get_computation_options(Options)); }
template <typename Derived>
explicit JacobiSVD(const MatrixBase<Derived>& matrix) {
compute_impl(matrix, internal::get_computation_options(Options));
}
/** \brief Constructor performing the decomposition of given matrix using specified options
* for computing unitaries.
@ -580,8 +583,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
* be specified in the \a Options template parameter.
*/
// EIGEN_DEPRECATED // TODO(cantonios): re-enable after fixing a few 3p libraries that error on deprecation warnings.
JacobiSVD(const MatrixType& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
template <typename Derived>
JacobiSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(computationOptions, matrix.rows(),
matrix.cols());
compute_impl(matrix, computationOptions);
}
@ -590,7 +595,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
JacobiSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); }
template <typename Derived>
JacobiSVD& compute(const MatrixBase<Derived>& matrix) {
return compute_impl(matrix, m_computationOptions);
}
/** \brief Method performing the decomposition of given matrix, as specified by
* the `computationOptions` parameter.
@ -601,8 +609,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
* \deprecated Will be removed in the next major Eigen version. Options should
* be specified in the \a Options template parameter.
*/
EIGEN_DEPRECATED JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixType, Options>(m_computationOptions, matrix.rows(), matrix.cols());
template <typename Derived>
EIGEN_DEPRECATED JacobiSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(m_computationOptions, matrix.rows(),
matrix.cols());
return compute_impl(matrix, computationOptions);
}
@ -626,7 +636,8 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
}
private:
JacobiSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
template <typename Derived>
JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
protected:
using Base::m_computationOptions;
@ -664,8 +675,13 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
};
template <typename MatrixType, int Options>
JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixType& matrix,
template <typename Derived>
JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
unsigned int computationOptions) {
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
Input matrix must have the same Scalar type as the JacobiSVD object.);
using std::abs;
allocate(matrix.rows(), matrix.cols(), computationOptions);

View File

@ -40,65 +40,65 @@ namespace Eigen {
/** \internal Specialization for the data types supported by LAPACKe */
#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \
template <> \
inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) { \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
/*typedef MatrixType::Scalar Scalar;*/ \
/*typedef MatrixType::RealScalar RealScalar;*/ \
allocate(matrix.rows(), matrix.cols(), computationOptions); \
\
/*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
m_nonzeroSingularValues = diagSize(); \
\
lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
lapack_int matrix_order = LAPACKE_COLROW; \
char jobu, jobvt; \
LAPACKE_TYPE *u, *vt, dummy; \
jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
if (computeU()) { \
ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
u = (LAPACKE_TYPE*)m_matrixU.data(); \
} else { \
ldu = 1; \
u = &dummy; \
} \
MatrixType localV; \
lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(cols()) \
: (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize()) \
: 1; \
if (computeV()) { \
localV.resize(vt_rows, cols()); \
ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \
vt = (LAPACKE_TYPE*)localV.data(); \
} else { \
ldvt = 1; \
vt = &dummy; \
} \
Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; \
superb.resize(diagSize(), 1); \
MatrixType m_temp; \
m_temp = matrix; \
lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd( \
matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()), \
internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda, \
(LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
/* Check the result of the LAPACK call */ \
if (info < 0 || !m_singularValues.allFinite()) { \
m_info = InvalidInput; \
} else if (info > 0) { \
m_info = NoConvergence; \
} else { \
m_info = Success; \
if (computeV()) m_matrixV = localV.adjoint(); \
} \
/* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; \
* m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
m_isInitialized = true; \
return *this; \
#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \
template <> \
template <typename Derived> \
inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const MatrixBase<Derived>& matrix, unsigned int computationOptions) { \
/*typedef MatrixType::Scalar Scalar;*/ \
/*typedef MatrixType::RealScalar RealScalar;*/ \
allocate(matrix.rows(), matrix.cols(), computationOptions); \
\
/*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
m_nonzeroSingularValues = diagSize(); \
\
lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
lapack_int matrix_order = LAPACKE_COLROW; \
char jobu, jobvt; \
LAPACKE_TYPE *u, *vt, dummy; \
jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
if (computeU()) { \
ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
u = (LAPACKE_TYPE*)m_matrixU.data(); \
} else { \
ldu = 1; \
u = &dummy; \
} \
MatrixType localV; \
lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(cols()) \
: (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize()) \
: 1; \
if (computeV()) { \
localV.resize(vt_rows, cols()); \
ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \
vt = (LAPACKE_TYPE*)localV.data(); \
} else { \
ldvt = 1; \
vt = &dummy; \
} \
Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; \
superb.resize(diagSize(), 1); \
MatrixType m_temp; \
m_temp = matrix; \
lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd( \
matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()), \
internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda, \
(LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
/* Check the result of the LAPACK call */ \
if (info < 0 || !m_singularValues.allFinite()) { \
m_info = InvalidInput; \
} else if (info > 0) { \
m_info = NoConvergence; \
} else { \
m_info = Success; \
if (computeV()) m_matrixV = localV.adjoint(); \
} \
/* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; \
* m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
m_isInitialized = true; \
return *this; \
}
#define EIGEN_LAPACK_SVD_OPTIONS(OPTIONS) \

View File

@ -274,6 +274,10 @@ struct simpl_chol_helper {
}
};
// Symbol is ODR-used, so we need a definition.
template <typename Scalar, typename StorageIndex>
constexpr StorageIndex simpl_chol_helper<Scalar, StorageIndex>::kEmpty;
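For readers on pre-C++17 toolchains, a minimal illustration of the rule this satisfies (hypothetical struct, not from the patch):
struct S {
  static constexpr int kEmpty = -1;  // in-class declaration with initializer
};
// Pre-C++17, ODR-using the member (e.g. binding `const int& r = S::kEmpty;`)
// also requires this out-of-line definition; since C++17 it is implicitly inline.
constexpr int S::kEmpty;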
} // namespace internal
template <typename Derived>

View File

@ -36,10 +36,10 @@ inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot
Scalar res1(0);
Scalar res2(0);
for (; i; ++i) {
res1 += numext::conj(i.value()) * other.coeff(i.index());
res1 = numext::fma(numext::conj(i.value()), other.coeff(i.index()), res1);
++i;
if (i) {
res2 += numext::conj(i.value()) * other.coeff(i.index());
res2 = numext::fma(numext::conj(i.value()), other.coeff(i.index()), res2);
}
}
return res1 + res2;
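The switch to numext::fma keeps each product-add as one contracted operation; a hedged scalar sketch of the same two-accumulator pattern:
float dot2(const float* x, const float* y, int n) {
  float r1 = 0.f, r2 = 0.f;  // two independent chains hide the fma latency
  int i = 0;
  for (; i + 1 < n; i += 2) {
    r1 = Eigen::numext::fma(x[i], y[i], r1);
    r2 = Eigen::numext::fma(x[i + 1], y[i + 1], r2);
  }
  if (i < n) r1 = Eigen::numext::fma(x[i], y[i], r1);
  return r1 + r2;
}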

View File

@ -7,9 +7,7 @@
script:
- . ci/scripts/build.linux.script.sh
tags:
- linux
- eigen-runner
- cross-compiler
- saas-linux-2xlarge-amd64
rules:
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
@ -244,11 +242,13 @@ build:linux:rocm-latest:gcc-10:
EIGEN_CI_CROSS_TARGET_TRIPLE: arm-linux-gnueabihf
EIGEN_CI_ADDITIONAL_ARGS: >
-DEIGEN_TEST_CUSTOM_CXX_FLAGS=-march=armv7-a;-mfpu=neon-vfpv4
-DCMAKE_SYSTEM_NAME=Linux
-DCMAKE_CROSSCOMPILING_EMULATOR=qemu-arm-static;-L;/usr/arm-linux-gnueabihf
build:linux:cross:arm:gcc-10:default:
extends: .build:linux:cross:arm
variables:
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf qemu-user-static
EIGEN_CI_CROSS_C_COMPILER: arm-linux-gnueabihf-gcc-10
EIGEN_CI_CROSS_CXX_COMPILER: arm-linux-gnueabihf-g++-10
@ -258,7 +258,7 @@ build:linux:cross:arm:clang-12:default:
EIGEN_CI_INSTALL: clang-12
EIGEN_CI_C_COMPILER: clang-12
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12 qemu-user-static
######## aarch64 ###############################################################
@ -268,6 +268,8 @@ build:linux:cross:arm:clang-12:default:
EIGEN_CI_TARGET_ARCH: aarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: aarch64-linux-gnu
EIGEN_CI_ADDITIONAL_ARGS: -DEIGEN_TEST_CUSTOM_CXX_FLAGS=-march=armv8.2-a+fp16
tags:
- saas-linux-large-arm64
build:linux:cross:aarch64:gcc-10:default:
extends: .build:linux:cross:aarch64
@ -290,28 +292,27 @@ build:linux:cross:aarch64:clang-12:default:
.build:linux:cross:ppc64le:
extends: .build:linux:cross
image: ubuntu:24.04
variables:
EIGEN_CI_TARGET_ARCH: ppc64le
EIGEN_CI_CROSS_TARGET_TRIPLE: powerpc64le-linux-gnu
EIGEN_CI_ADDITIONAL_ARGS: >-
-DCMAKE_SYSTEM_NAME=Linux
-DCMAKE_CROSSCOMPILING_EMULATOR=qemu-ppc64le-static;-L;/usr/powerpc64le-linux-gnu
build:linux:cross:ppc64le:gcc-10:default:
build:linux:cross:ppc64le:gcc-14:default:
extends: .build:linux:cross:ppc64le
variables:
EIGEN_CI_C_COMPILER: gcc-10
EIGEN_CI_CXX_COMPILER: g++-10
EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu
EIGEN_CI_CROSS_C_COMPILER: powerpc64le-linux-gnu-gcc-10
EIGEN_CI_CROSS_CXX_COMPILER: powerpc64le-linux-gnu-g++-10
# Temporarily disable MMA until #2457 is resolved.
EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_ALTIVEC_DISABLE_MMA=1"
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu qemu-user-static
EIGEN_CI_CROSS_C_COMPILER: powerpc64le-linux-gnu-gcc-14
EIGEN_CI_CROSS_CXX_COMPILER: powerpc64le-linux-gnu-g++-14
build:linux:cross:ppc64le:clang-12:default:
build:linux:cross:ppc64le:clang-16:default:
extends: .build:linux:cross:ppc64le
variables:
EIGEN_CI_INSTALL: clang-12
EIGEN_CI_C_COMPILER: clang-12
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu clang-12
EIGEN_CI_C_COMPILER: clang-16
EIGEN_CI_CXX_COMPILER: clang++-16
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu clang-16 qemu-user-static
######## loongarch64 #################################################
@ -320,17 +321,13 @@ build:linux:cross:ppc64le:clang-12:default:
variables:
EIGEN_CI_TARGET_ARCH: loongarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu
tags:
- eigen-runner
- linux
- cross-compiler
# GCC-14 (minimum on Ubuntu 24)
build:linux:cross:loongarch64:gcc-14:default:
extends: .build:linux:cross:loongarch64
image: ubuntu:24.04
variables:
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu qemu-user-static
EIGEN_CI_CROSS_C_COMPILER: loongarch64-linux-gnu-gcc-14
EIGEN_CI_CROSS_CXX_COMPILER: loongarch64-linux-gnu-g++-14
EIGEN_CI_ADDITIONAL_ARGS: >-

View File

@ -9,6 +9,8 @@
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_MERGE_REQUEST_LABELS =~ "/all-tests/"
tags:
- saas-linux-2xlarge-amd64
##### x86-64 ###################################################################
.test:linux:x86-64:
@ -16,10 +18,6 @@
variables:
EIGEN_CI_TARGET_ARCH: x86_64
EIGEN_CI_CROSS_TARGET_TRIPLE: x86_64-linux-gnu
tags:
- eigen-runner
- linux
- x86-64
# GCC-6 (minimum on Ubuntu 18.04)
.test:linux:x86-64:gcc-6:default:
@ -289,18 +287,13 @@ test:linux:cuda-12.2:clang-12:
variables:
EIGEN_CI_TARGET_ARCH: arm
EIGEN_CI_CROSS_TARGET_TRIPLE: arm-linux-gnueabihf
# Enable cross-compiled arm binary to run on aarch64.
EIGEN_CI_BEFORE_SCRIPT: "ln -s /usr/arm-linux-gnueabihf/lib/ld-linux-armhf.so.3 /lib/ && export LD_LIBRARY_PATH=/usr/arm-linux-gnueabihf/lib/"
tags:
- eigen-runner
- linux
- aarch64
EIGEN_CI_CTEST_ARGS: --timeout 2000
.test:linux:arm:gcc-10:default:
extends: .test:linux:arm
needs: [ build:linux:cross:arm:gcc-10:default ]
variables:
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf qemu-user-static
test:linux:arm:gcc-10:default:official:
extends: .test:linux:arm:gcc-10:default
@ -316,7 +309,7 @@ test:linux:arm:gcc-10:default:unsupported:
extends: .test:linux:arm
needs: [ build:linux:cross:arm:clang-12:default ]
variables:
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12 qemu-user-static
test:linux:arm:clang-12:default:official:
extends: .test:linux:arm:clang-12:default
@ -336,9 +329,7 @@ test:linux:arm:clang-12:default:unsupported:
EIGEN_CI_TARGET_ARCH: aarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: aarch64-linux-gnu
tags:
- eigen-runner
- linux
- aarch64
- saas-linux-large-arm64
.test:linux:aarch64:gcc-10:default:
extends: .test:linux:aarch64
@ -376,60 +367,54 @@ test:linux:aarch64:clang-12:default:unsupported:
.test:linux:ppc64le:
extends: .test:linux
image: ubuntu:24.04
variables:
EIGEN_CI_TARGET_ARCH: ppc64le
EIGEN_CI_CROSS_TARGET_TRIPLE: powerpc64le-linux-gnu
tags:
- eigen-runner
- linux
- ppc64le
EIGEN_CI_CTEST_ARGS: --timeout 2000
.test:linux:ppc64le:gcc-10:default:
.test:linux:ppc64le:gcc-14:default:
extends: .test:linux:ppc64le
needs: [ build:linux:cross:ppc64le:gcc-10:default ]
needs: [ build:linux:cross:ppc64le:gcc-14:default ]
variables:
EIGEN_CI_INSTALL: g++-10
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu qemu-user-static
test:linux:ppc64le:gcc-10:default:official:
extends: .test:linux:ppc64le:gcc-10:default
test:linux:ppc64le:gcc-14:default:official:
extends: .test:linux:ppc64le:gcc-14:default
variables:
EIGEN_CI_CTEST_LABEL: Official
test:linux:ppc64le:gcc-10:default:unsupported:
extends: .test:linux:ppc64le:gcc-10:default
test:linux:ppc64le:gcc-14:default:unsupported:
extends: .test:linux:ppc64le:gcc-14:default
variables:
EIGEN_CI_CTEST_LABEL: Unsupported
.test:linux:ppc64le:clang-12:default:
.test:linux:ppc64le:clang-16:default:
extends: .test:linux:ppc64le
needs: [ build:linux:cross:ppc64le:clang-12:default ]
needs: [ build:linux:cross:ppc64le:clang-16:default ]
variables:
EIGEN_CI_INSTALL: clang-12
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu clang-16 qemu-user-static
test:linux:ppc64le:clang-12:default:official:
extends: .test:linux:ppc64le:clang-12:default
test:linux:ppc64le:clang-16:default:official:
extends: .test:linux:ppc64le:clang-16:default
variables:
EIGEN_CI_CTEST_LABEL: Official
test:linux:ppc64le:clang-12:default:unsupported:
extends: .test:linux:ppc64le:clang-12:default
test:linux:ppc64le:clang-16:default:unsupported:
extends: .test:linux:ppc64le:clang-16:default
variables:
EIGEN_CI_CTEST_LABEL: Unsupported
##### loongarch64 ###################################################################
##### loongarch64 ##############################################################
.test:linux:loongarch64:
extends: .test:linux
image: ubuntu:24.04
variables:
EIGEN_CI_TARGET_ARCH: loongarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu
# Install QEMU and set up the execution environment in the image
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu qemu-user-static
EIGEN_CI_CTEST_ARGS: --timeout 2000
tags:
- eigen-runner
- linux
- cross-compiler
# GCC-14 (Ubuntu 24)
.test:linux:loongarch64:gcc-14:default:

View File

@ -16,7 +16,7 @@
#pragma GCC diagnostic ignored "-Wshadow"
#endif
#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
#if defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW)
struct my_exception {
my_exception() {}
~my_exception() {}
@ -76,7 +76,7 @@ class AnnoyingScalar {
}
AnnoyingScalar operator+(const AnnoyingScalar& other) const {
#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
#if defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW)
countdown--;
if (countdown <= 0 && !dont_throw) throw my_exception();
#endif

View File

@ -1340,7 +1340,7 @@ EIGEN_DECLARE_TEST(array_cwise) {
CALL_SUBTEST_3(array_generic(Array44d()));
CALL_SUBTEST_4(array_generic(
ArrayXXcf(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_7(array_generic(
CALL_SUBTEST_5(array_generic(
ArrayXXf(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_8(array_generic(
ArrayXXi(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));

View File

@ -8,7 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Various sanity tests with exceptions and non trivially copyable scalar type.
// - no memory leak when a custom scalar type trow an exceptions
// - no memory leak when a custom scalar type throws an exception
// - todo: complete the list of tests!
#define EIGEN_STACK_ALLOCATION_LIMIT 100000000
@ -21,9 +21,8 @@
AnnoyingScalar::countdown = 100; \
int before = AnnoyingScalar::instances; \
bool exception_thrown = false; \
try { \
OP; \
} catch (my_exception) { \
EIGEN_TRY { OP; } \
EIGEN_CATCH(my_exception) { \
exception_thrown = true; \
VERIFY(AnnoyingScalar::instances == before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
} \
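EIGEN_TRY/EIGEN_CATCH expand to plain try/catch when exceptions are enabled and to an if(true)/else pair when they are not, which is why the macro body no longer needs its own guard; schematically (risky_op and on_fail are placeholders):
EIGEN_TRY { risky_op(); }    // `try` under EIGEN_EXCEPTIONS, `if (true)` otherwise
EIGEN_CATCH(my_exception) {  // `catch (my_exception)` or a dead `else` branch
  on_fail();
}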
@ -35,7 +34,11 @@ EIGEN_DECLARE_TEST(exceptions) {
typedef Eigen::Matrix<AnnoyingScalar, Dynamic, Dynamic> MatrixType;
{
#if defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW)
AnnoyingScalar::dont_throw = false;
#else
AnnoyingScalar::dont_throw = true;
#endif
int n = 50;
VectorType v0(n), v1(n);
MatrixType m0(n, n), m1(n, n), m2(n, n);

View File

@ -78,6 +78,19 @@ void quaternion(void) {
VERIFY(ss.str() == "0i + 0j + 0k + 1");
#endif
// Consistent handling of scalar first/last conventions regardless of Eigen's own coefficient layout
const Scalar w(a);
const Vector3 xyz(v0);
q1 = Quaternionx::FromCoeffsScalarFirst(w, xyz.x(), xyz.y(), xyz.z());
q2 = Quaternionx::FromCoeffsScalarLast(xyz.x(), xyz.y(), xyz.z(), w);
VERIFY_IS_EQUAL(q1, q2);
VERIFY_IS_EQUAL(q1.coeffsScalarFirst()[0], w);
VERIFY_IS_EQUAL(q1.coeffsScalarFirst()(seqN(1, 3)), xyz);
VERIFY_IS_EQUAL(q1.coeffsScalarLast()[3], w);
VERIFY_IS_EQUAL(q1.coeffsScalarLast()(seqN(0, 3)), xyz);
// concatenation
q1 *= q2;

View File

@ -4,7 +4,7 @@
#include <Eigen/Core>
// Allow gpu** macros for generic tests.
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
#include <Eigen/src/Core/util/GpuHipCudaDefines.inc>
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
// doesn't allow std::tuple to compile for host code either. In these cases,

View File

@ -72,17 +72,16 @@ void test_conversion() {
// NaNs and infinities.
VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
VERIFY(!(numext::isnan)(float(half(0.0f))));
VERIFY((numext::isfinite)(float(half(65504.0f))));
VERIFY((numext::isfinite)(float(half(0.0f))));
VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
#if !EIGEN_COMP_MSVC
// Visual Studio errors out on divisions by 0
VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
#endif
VERIFY((numext::isnan)(float(NumTraits<half>::quiet_NaN())));
VERIFY((numext::isinf)(float(NumTraits<half>::infinity())));
VERIFY((numext::isinf)(float(-NumTraits<half>::infinity())));
// Exactly same checks as above, just directly on the half representation.
VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
@ -92,12 +91,9 @@ void test_conversion() {
VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
#if !EIGEN_COMP_MSVC
// Visual Studio errors out on divisions by 0
VERIFY((numext::isnan)(half(0.0 / 0.0)));
VERIFY((numext::isinf)(half(1.0 / 0.0)));
VERIFY((numext::isinf)(half(-1.0 / 0.0)));
#endif
VERIFY((numext::isnan)(NumTraits<half>::quiet_NaN()));
VERIFY((numext::isinf)(NumTraits<half>::infinity()));
VERIFY((numext::isinf)(-NumTraits<half>::infinity()));
// Conversion to bool
VERIFY(!static_cast<bool>(half(0.0)));
@ -204,19 +200,25 @@ void test_comparison() {
VERIFY(half(1.0f) != half(2.0f));
// Comparisons with NaNs and infinities.
#if !EIGEN_COMP_MSVC
// Visual Studio errors out on divisions by 0
VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
VERIFY(!(NumTraits<half>::quiet_NaN() == NumTraits<half>::quiet_NaN()));
VERIFY(NumTraits<half>::quiet_NaN() != NumTraits<half>::quiet_NaN());
VERIFY(!(half(1.0) == half(0.0 / 0.0)));
VERIFY(!(half(1.0) < half(0.0 / 0.0)));
VERIFY(!(half(1.0) > half(0.0 / 0.0)));
VERIFY(half(1.0) != half(0.0 / 0.0));
VERIFY(!(internal::random<half>() == NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() < NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() > NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() <= NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() >= NumTraits<half>::quiet_NaN()));
VERIFY(internal::random<half>() != NumTraits<half>::quiet_NaN());
VERIFY(half(1.0) < half(1.0 / 0.0));
VERIFY(half(1.0) > half(-1.0 / 0.0));
#endif
VERIFY(!(NumTraits<half>::quiet_NaN() == internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() < internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() > internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() <= internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() >= internal::random<half>()));
VERIFY(NumTraits<half>::quiet_NaN() != internal::random<half>());
VERIFY(internal::random<half>() < NumTraits<half>::infinity());
VERIFY(internal::random<half>() > -NumTraits<half>::infinity());
}
void test_basic_functions() {

View File

@ -343,7 +343,7 @@ static std::vector<std::string> eigen_assert_list;
#if !defined(EIGEN_TESTING_CONSTEXPR) && !defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
#define EIGEN_INTERNAL_DEBUGGING
#endif
#include <Eigen/QR> // required for createRandomPIMatrixOfRank and generateRandomMatrixSvs
#include <Eigen/Core>
inline void verify_impl(bool condition, const char* testname, const char* file, int line,
const char* condition_as_string) {
@ -935,3 +935,7 @@ int main(int argc, char* argv[]) {
#endif
#include "gpu_test_helper.h"
#ifndef EIGEN_TEST_MAX_SIZE
#define EIGEN_TEST_MAX_SIZE 320
#endif

View File

@ -1,6 +1,8 @@
#include "main.h"
#ifdef EIGEN_EXCEPTIONS
#include <exception> // std::exception
#endif
#include <Eigen/src/Core/util/MaxSizeVector.h>
@ -31,28 +33,27 @@ struct Foo {
std::cout << '~';
--Foo::object_count;
}
#ifdef EIGEN_EXCEPTIONS
class Fail : public std::exception {};
#endif
};
Index Foo::object_count = 0;
Index Foo::object_limit = 0;
EIGEN_DECLARE_TEST(cxx11_maxsizevector) {
EIGEN_DECLARE_TEST(maxsizevector) {
typedef MaxSizeVector<Foo> VectorX;
Foo::object_count = 0;
for (int r = 0; r < g_repeat; r++) {
Index rows = internal::random<Index>(3, 30);
Foo::object_limit = internal::random<Index>(0, rows - 2);
std::cout << "object_limit = " << Foo::object_limit << std::endl;
bool exception_raised = false;
#ifdef EIGEN_EXCEPTIONS
bool exception_raised = false;
try {
#endif
std::cout << "\nVectorX m(" << rows << ");\n";
VectorX vect(rows);
for (int i = 0; i < rows; ++i) vect.push_back(Foo());
#ifdef EIGEN_EXCEPTIONS
VERIFY(false); // not reached if exceptions are enabled
} catch (const Foo::Fail&) {
exception_raised = true;

View File

@ -354,28 +354,28 @@ void packetmath_boolean_mask_ops() {
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE1(internal::ptrue, internal::ptrue);
CHECK_CWISE1_MASK(internal::ptrue, internal::ptrue);
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(RealScalar(i));
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
// Test (-0) == (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
}
template <typename Scalar, typename Packet>
@ -384,28 +384,27 @@ void packetmath_boolean_mask_ops_real() {
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
// Test (-0) <=/< (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
}
template <typename Scalar, typename Packet, typename EnableIf = void>
@ -422,31 +421,30 @@ struct packetmath_boolean_mask_ops_notcomplex_test<
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
// Test (-0) <=/< (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
}
};
@ -700,11 +698,12 @@ void packetmath() {
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>(Scalar(0) - limit, limit);
}
} else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex) {
} else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex && !std::is_same<Scalar, bool>::value) {
// Prevent very small product results by adjusting range. Otherwise,
// we may end up with multiplying e.g. 32 Eigen::halfs with values < 1.
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>(Scalar(0.5), Scalar(1)) * (internal::random<bool>() ? Scalar(-1) : Scalar(1));
data1[i] = REF_MUL(internal::random<Scalar>(Scalar(0.5), Scalar(1)),
(internal::random<bool>() ? Scalar(-1) : Scalar(1)));
}
}
ref[0] = Scalar(1);

View File

@ -115,6 +115,30 @@ bool areApprox(const Scalar* a, const Scalar* b, int size, const typename NumTra
VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
}
#define CHECK_CWISE1_MASK(REFOP, POP) \
{ \
bool ref_mask[PacketSize] = {}; \
bool data_mask[PacketSize] = {}; \
internal::pstore(data2, POP(internal::pload<Packet>(data1))); \
for (int i = 0; i < PacketSize; ++i) { \
ref_mask[i] = numext::is_exactly_zero(REFOP(data1[i])); \
data_mask[i] = numext::is_exactly_zero(data2[i]); \
} \
VERIFY(test::areEqual(ref_mask, data_mask, PacketSize) && #POP); \
}
#define CHECK_CWISE2_MASK(REFOP, POP) \
{ \
bool ref_mask[PacketSize] = {}; \
bool data_mask[PacketSize] = {}; \
internal::pstore(data2, POP(internal::pload<Packet>(data1), internal::pload<Packet>(data1 + PacketSize))); \
for (int i = 0; i < PacketSize; ++i) { \
ref_mask[i] = numext::is_exactly_zero(REFOP(data1[i], data1[i + PacketSize])); \
data_mask[i] = numext::is_exactly_zero(data2[i]); \
} \
VERIFY(test::areEqual(ref_mask, data_mask, PacketSize) && #POP); \
}
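Unlike the CHECK_CWISE*_IF calls they replace for the pcmp_* ops, these macros compare only the boolean pattern (zero vs. non-zero lanes) of the packet result against the scalar reference, rather than raw payload bits. A hypothetical call site, assuming the data1/data2/PacketSize fixtures of the packetmath tests:
// Verify that the vectorized pcmp_le produces the same mask as the scalar reference.
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);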
// Checks component-wise for input of size N. All of data1, data2, and ref
// should have size at least ceil(N/PacketSize)*PacketSize to avoid memory
// access errors.

View File

@ -57,6 +57,10 @@ void product_selfadjoint(const MatrixType& m) {
v1.tail(rows - 1) * v2.head(cols - 1).adjoint() + v2.head(cols - 1) * v1.tail(rows - 1).adjoint();
VERIFY_IS_APPROX(m2, m3.template triangularView<Lower>().toDenseMatrix());
}
// matrix-vector
m2 = m1.template triangularView<Lower>();
VERIFY_IS_APPROX(m1 * m4, m2.template selfadjointView<Lower>() * m4);
}
EIGEN_DECLARE_TEST(product_selfadjoint) {

View File

@ -37,12 +37,9 @@ void matrixRedux(const MatrixType& m) {
m2.array() = m2.array() - kMaxVal * (m2.array() / kMaxVal);
}
VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
VERIFY_IS_APPROX(
MatrixType::Ones(rows, cols).sum(),
Scalar(float(
rows *
cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
VERIFY_IS_EQUAL(MatrixType::Zero(rows, cols).sum(), Scalar(0));
Scalar sizeAsScalar = internal::cast<Index, Scalar>(rows * cols);
VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), sizeAsScalar);
Scalar s(0), p(1), minc(numext::real(m1.coeff(0))), maxc(numext::real(m1.coeff(0)));
for (int j = 0; j < cols; j++)
for (int i = 0; i < rows; i++) {
@ -160,6 +157,10 @@ EIGEN_DECLARE_TEST(redux) {
int maxsize = (std::min)(100, EIGEN_TEST_MAX_SIZE);
TEST_SET_BUT_UNUSED_VARIABLE(maxsize);
for (int i = 0; i < g_repeat; i++) {
int rows = internal::random<int>(1, maxsize);
int cols = internal::random<int>(1, maxsize);
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
CALL_SUBTEST_1(matrixRedux(Matrix<float, 1, 1>()));
CALL_SUBTEST_1(matrixRedux(Array<float, 1, 1>()));
CALL_SUBTEST_2(matrixRedux(Matrix2f()));
@ -168,19 +169,37 @@ EIGEN_DECLARE_TEST(redux) {
CALL_SUBTEST_3(matrixRedux(Matrix4d()));
CALL_SUBTEST_3(matrixRedux(Array4d()));
CALL_SUBTEST_3(matrixRedux(Array44d()));
CALL_SUBTEST_4(matrixRedux(MatrixXcf(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_4(matrixRedux(ArrayXXcf(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_5(matrixRedux(MatrixXd(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_5(matrixRedux(ArrayXXd(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_6(matrixRedux(MatrixXi(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_6(matrixRedux(ArrayXXi(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_4(matrixRedux(MatrixXf(rows, cols)));
CALL_SUBTEST_4(matrixRedux(ArrayXXf(rows, cols)));
CALL_SUBTEST_4(matrixRedux(MatrixXd(rows, cols)));
CALL_SUBTEST_4(matrixRedux(ArrayXXd(rows, cols)));
/* TODO: fix test for boolean */
/*CALL_SUBTEST_5(matrixRedux(MatrixX<bool>(rows, cols)));*/
/*CALL_SUBTEST_5(matrixRedux(ArrayXX<bool>(rows, cols)));*/
CALL_SUBTEST_5(matrixRedux(MatrixXi(rows, cols)));
CALL_SUBTEST_5(matrixRedux(ArrayXXi(rows, cols)));
CALL_SUBTEST_5(matrixRedux(MatrixX<int64_t>(rows, cols)));
CALL_SUBTEST_5(matrixRedux(ArrayXX<int64_t>(rows, cols)));
CALL_SUBTEST_6(matrixRedux(MatrixXcf(rows, cols)));
CALL_SUBTEST_6(matrixRedux(ArrayXXcf(rows, cols)));
CALL_SUBTEST_7(matrixRedux(MatrixXcd(rows, cols)));
CALL_SUBTEST_7(matrixRedux(ArrayXXcd(rows, cols)));
}
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_7(vectorRedux(Vector4f()));
CALL_SUBTEST_7(vectorRedux(Array4f()));
CALL_SUBTEST_5(vectorRedux(VectorXd(internal::random<int>(1, maxsize))));
CALL_SUBTEST_5(vectorRedux(ArrayXd(internal::random<int>(1, maxsize))));
CALL_SUBTEST_8(vectorRedux(VectorXf(internal::random<int>(1, maxsize))));
CALL_SUBTEST_8(vectorRedux(ArrayXf(internal::random<int>(1, maxsize))));
int size = internal::random<int>(1, maxsize);
EIGEN_UNUSED_VARIABLE(size);
CALL_SUBTEST_8(vectorRedux(Vector4f()));
CALL_SUBTEST_8(vectorRedux(Array4f()));
CALL_SUBTEST_9(vectorRedux(VectorXf(size)));
CALL_SUBTEST_9(vectorRedux(ArrayXf(size)));
CALL_SUBTEST_10(vectorRedux(VectorXd(size)));
CALL_SUBTEST_10(vectorRedux(ArrayXd(size)));
/* TODO: fix test for boolean */
/*CALL_SUBTEST_10(vectorRedux(VectorX<bool>(size)));*/
/*CALL_SUBTEST_10(vectorRedux(ArrayX<bool>(size)));*/
CALL_SUBTEST_10(vectorRedux(VectorXi(size)));
CALL_SUBTEST_10(vectorRedux(ArrayXi(size)));
CALL_SUBTEST_10(vectorRedux(VectorX<int64_t>(size)));
CALL_SUBTEST_10(vectorRedux(ArrayX<int64_t>(size)));
}
}

View File

@ -9,6 +9,7 @@
#include "main.h"
#ifdef EIGEN_EXCEPTIONS
#define VERIFY_THROWS_BADALLOC(a) \
{ \
bool threw = false; \
@ -19,6 +20,10 @@
} \
VERIFY(threw && "should have thrown bad_alloc: " #a); \
}
#else
// No way to catch a bad alloc - program terminates.
#define VERIFY_THROWS_BADALLOC(a)
#endif
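With exceptions disabled there is no portable way to observe the failed allocation, so the macro expands to nothing, as the comment notes. A sketch of an invocation (the sizes are illustrative, merely large enough that the allocation must fail):
// triggerMatrixBadAlloc is defined just below; 2^40 x 2^40 doubles cannot be allocated.
VERIFY_THROWS_BADALLOC(triggerMatrixBadAlloc<Eigen::MatrixXd>(Eigen::Index(1) << 40, Eigen::Index(1) << 40));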
template <typename MatrixType>
void triggerMatrixBadAlloc(Index rows, Index cols) {

View File

@ -381,6 +381,7 @@ void svd_verify_assert_full_only(const MatrixType& input = MatrixType()) {
typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, 1> RhsType;
RhsType rhs = RhsType::Zero(input.rows());
EIGEN_UNUSED_VARIABLE(rhs); // Only used if asserts are enabled.
MatrixType m(input.rows(), input.cols());
svd_fill_random(m);
@ -410,6 +411,7 @@ void svd_verify_assert(const MatrixType& input = MatrixType()) {
enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime };
typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, 1> RhsType;
RhsType rhs = RhsType::Zero(input.rows());
EIGEN_UNUSED_VARIABLE(rhs); // Only used if asserts are enabled.
MatrixType m(input.rows(), input.cols());
svd_fill_random(m);

View File

@ -214,6 +214,17 @@ void vectorwiseop_matrix(const MatrixType& m) {
VERIFY_IS_EQUAL(m1.real().middleCols(0, fix<0>).colwise().maxCoeff().eval().cols(), 0);
}
void vectorwiseop_mixedscalar() {
Matrix4cd a = Matrix4cd::Random();
Vector4cd b = Vector4cd::Random();
b.imag().setZero();
Vector4d b_real = b.real();
Matrix4cd c = a.array().rowwise() * b.array().transpose();
Matrix4cd d = a.array().rowwise() * b_real.array().transpose();
VERIFY_IS_CWISE_EQUAL(c, d);
}
EIGEN_DECLARE_TEST(vectorwiseop) {
CALL_SUBTEST_1(vectorwiseop_array(Array22cd()));
CALL_SUBTEST_2(vectorwiseop_array(Array<double, 3, 2>()));
@ -226,4 +237,5 @@ EIGEN_DECLARE_TEST(vectorwiseop) {
MatrixXd(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_7(vectorwiseop_matrix(VectorXd(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_7(vectorwiseop_matrix(RowVectorXd(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_8(vectorwiseop_mixedscalar());
}

View File

@ -10,19 +10,11 @@
#include "main.h"
template <typename MatrixType>
void matrixVisitor(const MatrixType& p) {
void matrixVisitor_impl(MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
Index rows = p.rows();
Index cols = p.cols();
// construct a random matrix where all coefficients are different
MatrixType m;
m = MatrixType::Random(rows, cols);
for (Index i = 0; i < m.size(); i++)
for (Index i2 = 0; i2 < i; i2++)
while (numext::equal_strict(m(i), m(i2))) // yes, strict equality
m(i) = internal::random<Scalar>();
Index rows = m.rows();
Index cols = m.cols();
Scalar minc = Scalar(1000), maxc = Scalar(-1000);
Index minrow = 0, mincol = 0, maxrow = 0, maxcol = 0;
@ -119,6 +111,22 @@ void matrixVisitor(const MatrixType& p) {
VERIFY((numext::isnan)(eigen_maxc));
}
}
template <typename MatrixType>
void matrixVisitor(const MatrixType& p) {
MatrixType m(p.rows(), p.cols());
// construct a random matrix where all coefficients are different
m.setRandom();
for (Index i = 0; i < m.size(); i++)
for (Index i2 = 0; i2 < i; i2++)
while (numext::equal_strict(m(i), m(i2))) // yes, strict equality
m(i) = internal::random<typename DenseBase<MatrixType>::Scalar>();
MatrixType n = m;
matrixVisitor_impl(m);
// force outer-inner access pattern
using BlockType = Block<MatrixType, Dynamic, Dynamic>;
BlockType m_block = n.block(0, 0, n.rows(), n.cols());
matrixVisitor_impl(m_block);
}
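Presumably the Block wrapper is used because a dynamic-sized Block expression does not advertise linear access, so visiting m_block drives the visitor's outer/inner traversal, while the plain matrix covers the flat-index path.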
template <typename VectorType>
void vectorVisitor(const VectorType& w) {

View File

@ -24,6 +24,8 @@ void zeroReduction(const MatrixType& m) {
VERIFY_RAISES_ASSERT(m.minCoeff());
VERIFY_RAISES_ASSERT(m.maxCoeff());
Index i, j;
EIGEN_UNUSED_VARIABLE(i); // Only used if exceptions are enabled.
EIGEN_UNUSED_VARIABLE(j);
VERIFY_RAISES_ASSERT(m.minCoeff(&i, &j));
VERIFY_RAISES_ASSERT(m.maxCoeff(&i, &j));
VERIFY_RAISES_ASSERT(m.reshaped().minCoeff(&i));

View File

@ -45,7 +45,7 @@
#include <thread>
#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#include "../../../Eigen/ThreadPool"
#endif
#ifdef EIGEN_USE_GPU

File diff suppressed because it is too large Load Diff

View File

@ -10,14 +10,11 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
// This header file contains defines for gpu* macros which will resolve to
// their equivalent hip* or cuda* versions depending on the compiler in use.
// A separate header (included at the end of this file) will undefine all of them.
#include "TensorGpuHipCudaDefines.h"
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "../../../../../Eigen/src/Core/util/GpuHipCudaDefines.inc"
namespace Eigen {
static const int kGpuScratchSize = 1024;
@ -390,6 +387,6 @@ static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig co
} // end namespace Eigen
// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"
#include "../../../../../Eigen/src/Core/util/GpuHipCudaUndefines.inc"
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
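Net effect of this hunk: the gpu*-to-hip*/cuda* macro definitions move out of the unsupported Tensor module into shared Eigen/src/Core/util/*.inc files, so Core and Tensor consume one copy of the define/undefine pair instead of the Tensor-local headers.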

View File

@ -37,12 +37,13 @@
* - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
* - MKL (https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html) : fastest, free -- may be
* incompatible with Eigen in GPL form.
* - pocketfft (https://gitlab.mpcdf.mpg.de/mtr/pocketfft) : faster than kissfft, BSD 3-clause.
* - PocketFFT/DUCC (https://gitlab.mpcdf.mpg.de/mtr/pocketfft, https://gitlab.mpcdf.mpg.de/mtr/ducc) : faster than kissfft, BSD 3-clause.
* It is a heavily modified implementation of FFTPack, with the following advantages:
* 1. strictly C++11 compliant
* 2. more accurate twiddle factor computation
* 3. very fast plan generation
* 4. worst case complexity for transform sizes with large prime factors is N*log(N), because Bluestein's
* algorithm is used for these cases
* According to the author, DUCC contains the "evolution" of pocketfft, though the interface is very similar.
*
* \section FFTDesign Design
@ -85,7 +86,7 @@
#ifdef EIGEN_FFTW_DEFAULT
// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
#include <fftw3.h>
#include "src/FFT/ei_fftw_impl.h"
#include "src/FFT/fftw_impl.h"
namespace Eigen {
// template <typename T> typedef struct internal::fftw_impl default_fft_impl; this does not work
template <typename T>
@ -93,7 +94,7 @@ struct default_fft_impl : public internal::fftw_impl<T> {};
} // namespace Eigen
#elif defined EIGEN_MKL_DEFAULT
// intel Math Kernel Library: fastest, free -- may be incompatible with Eigen in GPL form
#include "src/FFT/ei_imklfft_impl.h"
#include "src/FFT/imklfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::imklfft::imklfft_impl<T> {};
@ -101,14 +102,24 @@ struct default_fft_impl : public internal::imklfft::imklfft_impl<T> {};
#elif defined EIGEN_POCKETFFT_DEFAULT
// internal::pocketfft_impl: a heavily modified implementation of FFTPack, with many advantages.
#include <pocketfft_hdronly.h>
#include "src/FFT/ei_pocketfft_impl.h"
#include "src/FFT/pocketfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::pocketfft_impl<T> {};
} // namespace Eigen
#elif defined EIGEN_DUCCFFT_DEFAULT
#include <ducc0/fft/fft.h>
#include <ducc0/infra/string_utils.h>
#include <ducc0/fft/fft.h>
#include <ducc0/fft/fftnd_impl.h>
#include "src/FFT/duccfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::duccfft_impl<T> {};
} // namespace Eigen
#else
// internal::kissfft_impl: small, free, reasonably efficient default, derived from kissfft
#include "src/FFT/ei_kissfft_impl.h"
#include "src/FFT/kissfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::kissfft_impl<T> {};
@ -204,7 +215,8 @@ class FFT {
inline void fwd(Complex* dst, const Complex* src, Index nfft) { m_impl.fwd(dst, src, static_cast<int>(nfft)); }
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
inline void fwd2(Complex* dst, const Complex* src, int n0, int n1) { m_impl.fwd2(dst, src, n0, n1); }
#endif
@ -366,7 +378,8 @@ class FFT {
inv(&dst[0], &src[0], nfft);
}
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
inline void inv2(Complex* dst, const Complex* src, int n0, int n1) {
m_impl.inv2(dst, src, n0, n1);
if (HasFlag(Unscaled) == false) scale(dst, 1. / (n0 * n1), n0 * n1);
@ -385,7 +398,6 @@ class FFT {
Matrix<T_Data, Dynamic, 1>::Map(x, nx) *= s;
else
Matrix<T_Data, Dynamic, 1>::MapAligned(x, nx) *= s;
// Matrix<T_Data, Dynamic, Dynamic>::Map(x,nx) * s;
#endif
}

View File

@ -0,0 +1,71 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
namespace Eigen {
namespace internal {
template <typename _Scalar>
struct duccfft_impl {
using Scalar = _Scalar;
using Complex = std::complex<Scalar>;
using shape_t = ducc0::fmav_info::shape_t;
using stride_t = ducc0::fmav_info::stride_t;
inline void clear() {}
inline void fwd(Complex* dst, const Scalar* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Scalar> m_in(src, shape_t{static_cast<size_t>(nfft)});
ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft) / 2 + 1});
ducc0::r2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
}
inline void fwd(Complex* dst, const Complex* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft)});
ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft)});
ducc0::c2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
}
inline void inv(Scalar* dst, const Complex* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft) / 2 + 1});
ducc0::vfmav<Scalar> m_out(dst, shape_t{static_cast<size_t>(nfft)});
ducc0::c2r(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
}
inline void inv(Complex* dst, const Complex* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft)});
ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft)});
ducc0::c2c(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
}
inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t axes{0, 1};
const shape_t in_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t out_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const stride_t stride{static_cast<ptrdiff_t>(nfft1), static_cast<ptrdiff_t>(1)};
ducc0::cfmav<Complex> m_in(src, in_shape, stride);
ducc0::vfmav<Complex> m_out(dst, out_shape, stride);
ducc0::c2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
}
inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t axes{0, 1};
const shape_t in_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t out_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const stride_t stride{static_cast<ptrdiff_t>(nfft1), static_cast<ptrdiff_t>(1)};
ducc0::cfmav<Complex> m_in(src, in_shape, stride);
ducc0::vfmav<Complex> m_out(dst, out_shape, stride);
ducc0::c2c(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
}
};
} // namespace internal
} // namespace Eigen
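A minimal round-trip sketch against the new backend, assuming the ducc0 headers are on the include path and mirroring the include order required by the duccfft test added below:
#define EIGEN_DUCCFFT_DEFAULT 1
#include <ducc0/fft/fft.h>         // must precede the Eigen FFT header
#include <ducc0/fft/fftnd_impl.h>
#include <unsupported/Eigen/FFT>
#include <complex>
#include <vector>
int main() {
  Eigen::FFT<double> fft;
  std::vector<double> signal(64, 1.0);
  std::vector<std::complex<double>> spectrum;
  fft.fwd(spectrum, signal);  // r2c forward, routed through duccfft_impl::fwd
  std::vector<double> back;
  fft.inv(back, spectrum);    // c2r inverse, rescaled unless the Unscaled flag is set
  return 0;
}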

View File

@ -5,17 +5,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
using namespace pocketfft;
using namespace pocketfft::detail;
namespace Eigen {
namespace internal {
template <typename _Scalar>
struct pocketfft_impl {
typedef _Scalar Scalar;
typedef std::complex<Scalar> Complex;
using Scalar = _Scalar;
using Complex = std::complex<Scalar>;
using shape_t = pocketfft::shape_t;
using stride_t = pocketfft::stride_t;
inline void clear() {}
@ -24,14 +23,14 @@ struct pocketfft_impl {
const shape_t axes_{0};
const stride_t stride_in{sizeof(Scalar)};
const stride_t stride_out{sizeof(Complex)};
r2c(shape_, stride_in, stride_out, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
pocketfft::r2c(shape_, stride_in, stride_out, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
}
inline void fwd(Complex* dst, const Complex* src, int nfft) {
const shape_t shape_{static_cast<size_t>(nfft)};
const shape_t axes_{0};
const stride_t stride_{sizeof(Complex)};
c2c(shape_, stride_, stride_, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
}
inline void inv(Scalar* dst, const Complex* src, int nfft) {
@ -39,28 +38,28 @@ struct pocketfft_impl {
const shape_t axes_{0};
const stride_t stride_in{sizeof(Complex)};
const stride_t stride_out{sizeof(Scalar)};
c2r(shape_, stride_in, stride_out, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2r(shape_, stride_in, stride_out, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
}
inline void inv(Complex* dst, const Complex* src, int nfft) {
const shape_t shape_{static_cast<size_t>(nfft)};
const shape_t axes_{0};
const stride_t stride_{sizeof(Complex)};
c2c(shape_, stride_, stride_, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
}
inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t shape_{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t axes_{0, 1};
const stride_t stride_{static_cast<ptrdiff_t>(sizeof(Complex) * nfft1), static_cast<ptrdiff_t>(sizeof(Complex))};
c2c(shape_, stride_, stride_, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
}
inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t shape_{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t axes_{0, 1};
const stride_t stride_{static_cast<ptrdiff_t>(sizeof(Complex) * nfft1), static_cast<ptrdiff_t>(sizeof(Complex))};
c2c(shape_, stride_, stride_, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
}
};

View File

@ -284,12 +284,13 @@ template <typename MatrixType>
struct matrix_exp_computeUV<MatrixType, long double> {
template <typename ArgType>
static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) {
using Scalar = typename traits<MatrixType>::Scalar;
#if LDBL_MANT_DIG == 53 // double precision
matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
#else
using Scalar = typename traits<MatrixType>::Scalar;
using std::frexp;
using std::pow;
const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();

View File

@ -1455,7 +1455,7 @@ struct zeta_impl {
if (q <= zero) {
if (q == numext::floor(q)) {
if (x == numext::floor(x) && long(x) % 2 == 0) {
if (numext::rint(Scalar(0.5) * x) == Scalar(0.5) * x) {
return maxnum;
} else {
return nan;
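The new predicate folds the old floor-and-parity test into one overflow-safe comparison: rint(0.5 * x) == 0.5 * x holds exactly when 0.5 * x is an integer, i.e. when x is an even integer, and it avoids the long(x) cast, which can overflow for large x. For instance:
// x = 4.0:  0.5*x = 2.0,  rint(2.0) == 2.0   -> even integer, return maxnum
// x = 3.0:  0.5*x = 1.5,  rint(1.5) == 2.0   -> mismatch (odd), return nan
// x = 2.5:  0.5*x = 1.25, rint(1.25) == 1.0  -> mismatch (non-integer), return nan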

View File

@ -88,6 +88,25 @@ else()
ei_add_property(EIGEN_MISSING_BACKENDS "pocketfft, ")
endif()
if(NOT DUCC_ROOT AND DEFINED ENV{DUCC_ROOT})
set( DUCC_ROOT $ENV{DUCC_ROOT} )
endif()
find_path(DUCCFFT
NAMES "src/ducc0/fft/fft.h"
PATHS ${DUCC_ROOT})
message(STATUS "${DUCC_ROOT} ${DUCCFFT}")
if(DUCCFFT)
ei_add_property(EIGEN_TESTED_BACKENDS "duccfft, ")
include_directories( "${DUCCFFT}/src" )
add_library(ducc_lib "${DUCCFFT}/src/ducc0/infra/string_utils.cc" "${DUCCFFT}/src/ducc0/infra/threading.cc")
target_compile_definitions(ducc_lib PUBLIC "DUCC0_NO_THREADING=1")
ei_add_test(duccfft "-DEIGEN_DUCCFFT_DEFAULT -DDUCC0_NO_THREADING=1" "ducc_lib" )
set_target_properties(ducc_lib duccfft PROPERTIES CXX_STANDARD 17)
else()
ei_add_property(EIGEN_MISSING_BACKENDS "duccfft, ")
endif()
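For local builds, the block above locates a DUCC source checkout either via -DDUCC_ROOT=<path> on the cmake command line or the DUCC_ROOT environment variable, compiles the two infra translation units into ducc_lib with threading disabled, and registers the duccfft test against it.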
option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
if(EIGEN_TEST_OPENGL)
find_package(OpenGL)

View File

@ -14,8 +14,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
template <int Layout>

View File

@ -17,8 +17,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;

View File

@ -17,8 +17,6 @@
#include "OffByOneScalar.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::RowMajor;
using Eigen::Tensor;

View File

@ -15,8 +15,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
void test_gpu_nullary() {

View File

@ -16,8 +16,6 @@
#include "main.h"
#include <Eigen/CXX11/Tensor>
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
void test_gpu_random_uniform() {
Tensor<float, 2> out(72, 97);
out.setZero();

View File

@ -16,8 +16,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;

View File

@ -0,0 +1,4 @@
#define EIGEN_DUCCFFT_DEFAULT 1
#include <ducc0/fft/fft.h> // Needs to be included before main.h
#include <ducc0/fft/fftnd_impl.h> // Same requirement
#include "fft_test_shared.h"

View File

@ -272,7 +272,7 @@ EIGEN_DECLARE_TEST(FFTW) {
CALL_SUBTEST(test_scalar<float>(2 * 3 * 4 * 5 * 7));
CALL_SUBTEST(test_scalar<double>(2 * 3 * 4 * 5 * 7));
#if defined EIGEN_HAS_FFTWL || defined EIGEN_POCKETFFT_DEFAULT
#if defined EIGEN_HAS_FFTWL || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT
CALL_SUBTEST(test_complex<long double>(32));
CALL_SUBTEST(test_complex<long double>(256));
CALL_SUBTEST(test_complex<long double>(3 * 8));
@ -294,13 +294,15 @@ EIGEN_DECLARE_TEST(FFTW) {
// fail to build since Eigen limit the stack allocation size,too big here.
// CALL_SUBTEST( ( test_complex2d<long double, 256, 256> () ) );
#endif
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
CALL_SUBTEST((test_complex2d<float, 24, 24>()));
CALL_SUBTEST((test_complex2d<float, 60, 60>()));
CALL_SUBTEST((test_complex2d<float, 24, 60>()));
CALL_SUBTEST((test_complex2d<float, 60, 24>()));
#endif
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
CALL_SUBTEST((test_complex2d<double, 24, 24>()));
CALL_SUBTEST((test_complex2d<double, 60, 60>()));
CALL_SUBTEST((test_complex2d<double, 24, 60>()));