Merge with master.

Chip Kerchner, 2025-07-10 12:22:28 -04:00
commit 8328eec90d
81 changed files with 5855 additions and 2190 deletions

CHANGELOG.md (new file, 2019 lines)

File diff suppressed because it is too large

View File

@@ -29,6 +29,11 @@ if (POLICY CMP0146)
cmake_policy(SET CMP0146 OLD)
endif ()
# Normalize DESTINATION paths
if (POLICY CMP0177)
cmake_policy(SET CMP0177 NEW)
endif ()
#==============================================================================
# CMake Project.
#==============================================================================
@@ -254,7 +259,7 @@ if(EIGEN_BUILD_CMAKE_PACKAGE)
DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
# Add uninstall target
if(NOT TARGET uninstall)
if(NOT TARGET uninstall AND PROJECT_IS_TOP_LEVEL)
add_custom_target ( uninstall
COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
endif()

View File

@@ -192,45 +192,38 @@ using std::ptrdiff_t;
#include "src/Core/arch/Default/BFloat16.h"
#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
#if defined EIGEN_VECTORIZE_AVX512
#if defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/Reductions.h"
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#endif
#if defined EIGEN_VECTORIZE_AVX
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/Reductions.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#endif
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/AVX512/PacketMath.h"
#include "src/Core/arch/AVX512/Reductions.h"
#include "src/Core/arch/AVX512/Complex.h"
#include "src/Core/arch/AVX512/TypeCasting.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#include "src/Core/arch/AVX512/TrsmKernel.h"
#endif
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/PacketMathFP16.h"
#endif
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/AVX512/TypeCasting.h"
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/TypeCastingFP16.h"
#endif
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX512/Complex.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#if defined EIGEN_VECTORIZE_AVX512FP16
#include "src/Core/arch/AVX512/MathFunctionsFP16.h"
#endif
#include "src/Core/arch/AVX512/TrsmKernel.h"
#elif defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"
#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
#include "src/Core/arch/AltiVec/PacketMath.h"
#include "src/Core/arch/AltiVec/TypeCasting.h"
#include "src/Core/arch/AltiVec/MathFunctions.h"
@@ -358,6 +351,7 @@ using std::ptrdiff_t;
#include "src/Core/SkewSymmetricMatrix3.h"
#include "src/Core/Redux.h"
#include "src/Core/Visitor.h"
#include "src/Core/FindCoeff.h"
#include "src/Core/Fuzzy.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"

View File

@@ -726,6 +726,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
Index count) const {
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<PacketType, NumPackets> packets;
for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
Index offset = begin / SrcPacketSize;
Index actualBegin = begin % SrcPacketSize;
for (; offset < NumPackets; offset++) {
@@ -743,6 +744,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
Index count) const {
constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
PacketBlock<PacketType, NumPackets> packets;
for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
Index offset = begin / SrcPacketSize;
Index actualBegin = begin % SrcPacketSize;
for (; offset < NumPackets; offset++) {

View File

@@ -45,10 +45,16 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
// - This is the return type of the coeff() method.
// - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
// to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
// - The DirectAccessBit means exactly that the underlying data of coefficients can be directly accessed as a plain
// strided array, which means exactly that the underlying data of coefficients does exist in memory, which means
// exactly that the coefficients are const-referenceable, which means exactly that we can have coeff() return a const
// reference. For example, Map<const Matrix> has DirectAccessBit but not LvalueBit, so Map<const Matrix>.coeff()
// returns a const Scalar& that exists in memory, while coeffRef() is not allowed since it could not provide an
// lvalue. Notice that DirectAccessBit and LvalueBit are orthogonal.
// - The is_arithmetic check is required since "const int", "const double", etc. will cause warnings on some systems,
// while the declaration of "const T", where T is a non-arithmetic type, does not. Always returning "const Scalar&" is
// not possible, since the underlying expression might not offer a valid address for the reference to refer to.
typedef std::conditional_t<bool(internal::traits<Derived>::Flags& LvalueBit), const Scalar&,
typedef std::conditional_t<bool(internal::traits<Derived>::Flags&(LvalueBit | DirectAccessBit)), const Scalar&,
std::conditional_t<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>>
CoeffReturnType;
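With the widened condition above, DirectAccessBit alone is enough for a const-reference return. A compile-time check, assuming Eigen with this change applied:

#include <Eigen/Core>
#include <type_traits>

int main() {
  // Map<const Matrix> has DirectAccessBit but not LvalueBit, yet coeff() can
  // still hand out a const reference into the mapped storage.
  using MapConst = Eigen::Map<const Eigen::MatrixXf>;
  static_assert(std::is_same<MapConst::CoeffReturnType, const float&>::value,
                "DirectAccessBit alone should yield const Scalar&");
}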

Eigen/src/Core/FindCoeff.h (new file, 464 lines)
View File

@@ -0,0 +1,464 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_FIND_COEFF_H
#define EIGEN_FIND_COEFF_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
struct max_coeff_functor {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return candidate > incumbent;
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pcmp_lt(incumbent, candidate);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_max(a);
}
};
template <typename Scalar>
struct max_coeff_functor<Scalar, PropagateNaN, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate > incumbent) || ((candidate != candidate) && (incumbent == incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_max<PropagateNaN>(a);
}
};
template <typename Scalar>
struct max_coeff_functor<Scalar, PropagateNumbers, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate > incumbent) || ((candidate == candidate) && (incumbent != incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(candidate));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_max<PropagateNumbers>(a);
}
};
template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
struct min_coeff_functor {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return candidate < incumbent;
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pcmp_lt(candidate, incumbent);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_min(a);
}
};
template <typename Scalar>
struct min_coeff_functor<Scalar, PropagateNaN, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate < incumbent) || ((candidate != candidate) && (incumbent == incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_min<PropagateNaN>(a);
}
};
template <typename Scalar>
struct min_coeff_functor<Scalar, PropagateNumbers, false> {
EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
return (candidate < incumbent) || ((candidate == candidate) && (incumbent != incumbent));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(candidate));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
return predux_min<PropagateNumbers>(a);
}
};
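The `x != x` / `x == x` expressions in the PropagateNaN and PropagateNumbers specializations are the classic self-comparison NaN tests. A scalar stand-in for the PropagateNaN max rule, with hypothetical names and std::isnan spelled out, not part of this commit:

#include <cassert>
#include <cmath>

// A candidate displaces the incumbent if it compares greater, or if it is NaN
// while the incumbent is not (so NaN, once seen, sticks).
bool nan_propagating_max_compare(float incumbent, float candidate) {
  return (candidate > incumbent) || (std::isnan(candidate) && !std::isnan(incumbent));
}

int main() {
  assert(nan_propagating_max_compare(1.f, 2.f));   // ordinary max
  assert(nan_propagating_max_compare(1.f, NAN));   // NaN displaces a number
  assert(!nan_propagating_max_compare(NAN, 2.f));  // a number never displaces NaN
}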
template <typename Scalar>
struct min_max_traits {
static constexpr bool PacketAccess = packet_traits<Scalar>::Vectorizable;
};
template <typename Scalar, int NaNPropagation>
struct functor_traits<max_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
template <typename Scalar, int NaNPropagation>
struct functor_traits<min_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
template <typename Evaluator, typename Func, bool Linear, bool Vectorize>
struct find_coeff_loop;
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ false> {
using Scalar = typename Evaluator::Scalar;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& outer, Index& inner) {
Index outerSize = eval.outerSize();
Index innerSize = eval.innerSize();
/* initialization performed in calling function */
/* result = eval.coeff(0, 0); */
/* outer = 0; */
/* inner = 0; */
for (Index j = 0; j < outerSize; j++) {
for (Index i = 0; i < innerSize; i++) {
Scalar xprCoeff = eval.coeffByOuterInner(j, i);
bool newRes = func.compareCoeff(res, xprCoeff);
if (newRes) {
outer = j;
inner = i;
res = xprCoeff;
}
}
}
}
};
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ false> {
using Scalar = typename Evaluator::Scalar;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& index) {
Index size = eval.size();
/* initialization performed in calling function */
/* result = eval.coeff(0); */
/* index = 0; */
for (Index k = 0; k < size; k++) {
Scalar xprCoeff = eval.coeff(k);
bool newRes = func.compareCoeff(res, xprCoeff);
if (newRes) {
index = k;
res = xprCoeff;
}
}
}
};
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ true> {
using ScalarImpl = find_coeff_loop<Evaluator, Func, false, false>;
using Scalar = typename Evaluator::Scalar;
using Packet = typename Evaluator::Packet;
static constexpr int PacketSize = unpacket_traits<Packet>::size;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& outer,
Index& inner) {
Index outerSize = eval.outerSize();
Index innerSize = eval.innerSize();
Index packetEnd = numext::round_down(innerSize, PacketSize);
/* initialization performed in calling function */
/* result = eval.coeff(0, 0); */
/* outer = 0; */
/* inner = 0; */
bool checkPacket = false;
for (Index j = 0; j < outerSize; j++) {
Packet resultPacket = pset1<Packet>(result);
for (Index i = 0; i < packetEnd; i += PacketSize) {
Packet xprPacket = eval.template packetByOuterInner<Unaligned, Packet>(j, i);
if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
outer = j;
inner = i;
result = func.predux(xprPacket);
resultPacket = pset1<Packet>(result);
checkPacket = true;
}
}
for (Index i = packetEnd; i < innerSize; i++) {
Scalar xprCoeff = eval.coeffByOuterInner(j, i);
if (func.compareCoeff(result, xprCoeff)) {
outer = j;
inner = i;
result = xprCoeff;
checkPacket = false;
}
}
}
if (checkPacket) {
result = eval.coeffByOuterInner(outer, inner);
Index i_end = inner + PacketSize;
for (Index i = inner; i < i_end; i++) {
Scalar xprCoeff = eval.coeffByOuterInner(outer, i);
if (func.compareCoeff(result, xprCoeff)) {
inner = i;
result = xprCoeff;
}
}
}
}
};
template <typename Evaluator, typename Func>
struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ true> {
using ScalarImpl = find_coeff_loop<Evaluator, Func, true, false>;
using Scalar = typename Evaluator::Scalar;
using Packet = typename Evaluator::Packet;
static constexpr int PacketSize = unpacket_traits<Packet>::size;
static constexpr int Alignment = Evaluator::Alignment;
static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) {
Index size = eval.size();
Index packetEnd = numext::round_down(size, PacketSize);
/* initialization performed in calling function */
/* result = eval.coeff(0); */
/* index = 0; */
Packet resultPacket = pset1<Packet>(result);
bool checkPacket = false;
for (Index k = 0; k < packetEnd; k += PacketSize) {
Packet xprPacket = eval.template packet<Alignment, Packet>(k);
if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
index = k;
result = func.predux(xprPacket);
resultPacket = pset1<Packet>(result);
checkPacket = true;
}
}
for (Index k = packetEnd; k < size; k++) {
Scalar xprCoeff = eval.coeff(k);
if (func.compareCoeff(result, xprCoeff)) {
index = k;
result = xprCoeff;
checkPacket = false;
}
}
if (checkPacket) {
result = eval.coeff(index);
Index k_end = index + PacketSize;
for (Index k = index; k < k_end; k++) {
Scalar xprCoeff = eval.coeff(k);
if (func.compareCoeff(result, xprCoeff)) {
index = k;
result = xprCoeff;
}
}
}
}
};
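Both vectorized loops share one subtlety: a packet hit only records where the winning packet starts, because predux collapses the lanes and loses the exact lane index; the final checkPacket pass rescans that single packet to recover the first matching position, and a later scalar-tail hit cancels the rescan since its index is already exact. A plain-array sketch of the same strategy, with hypothetical names, not part of this commit:

#include <algorithm>
#include <cassert>
#include <cstddef>

// B plays the role of PacketSize; std::max_element stands in for predux_max.
std::size_t argmax_blocked(const float* x, std::size_t n, std::size_t B) {
  float best = x[0];
  std::size_t idx = 0;
  bool fromBlock = false;
  std::size_t k = 0;
  for (; k + B <= n; k += B) {
    float m = *std::max_element(x + k, x + k + B);  // block-wide reduction
    if (m > best) { best = m; idx = k; fromBlock = true; }
  }
  for (; k < n; ++k)  // scalar tail, index already exact
    if (x[k] > best) { best = x[k]; idx = k; fromBlock = false; }
  if (fromBlock)      // the checkPacket step: pin down the lane in the winning block
    idx = static_cast<std::size_t>(std::max_element(x + idx, x + idx + B) - x);
  return idx;
}

int main() {
  float x[] = {1, 5, 2, 7, 7, 0};
  assert(argmax_blocked(x, 6, 2) == 3);  // first of the tied 7s
}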
template <typename Derived>
struct find_coeff_evaluator : public evaluator<Derived> {
using Base = evaluator<Derived>;
using Scalar = typename Derived::Scalar;
using Packet = typename packet_traits<Scalar>::type;
static constexpr int Flags = Base::Flags;
static constexpr bool IsRowMajor = bool(Flags & RowMajorBit);
EIGEN_DEVICE_FUNC inline find_coeff_evaluator(const Derived& xpr) : Base(xpr), m_xpr(xpr) {}
EIGEN_DEVICE_FUNC inline Scalar coeffByOuterInner(Index outer, Index inner) const {
Index row = IsRowMajor ? outer : inner;
Index col = IsRowMajor ? inner : outer;
return Base::coeff(row, col);
}
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC inline PacketType packetByOuterInner(Index outer, Index inner) const {
Index row = IsRowMajor ? outer : inner;
Index col = IsRowMajor ? inner : outer;
return Base::template packet<LoadMode, PacketType>(row, col);
}
EIGEN_DEVICE_FUNC inline Index innerSize() const { return m_xpr.innerSize(); }
EIGEN_DEVICE_FUNC inline Index outerSize() const { return m_xpr.outerSize(); }
EIGEN_DEVICE_FUNC inline Index size() const { return m_xpr.size(); }
const Derived& m_xpr;
};
template <typename Derived, typename Func>
struct find_coeff_impl {
using Evaluator = find_coeff_evaluator<Derived>;
static constexpr int Flags = Evaluator::Flags;
static constexpr int Alignment = Evaluator::Alignment;
static constexpr bool IsRowMajor = Derived::IsRowMajor;
static constexpr int MaxInnerSizeAtCompileTime =
IsRowMajor ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime;
static constexpr int MaxSizeAtCompileTime = Derived::MaxSizeAtCompileTime;
using Scalar = typename Derived::Scalar;
using Packet = typename Evaluator::Packet;
static constexpr int PacketSize = unpacket_traits<Packet>::size;
static constexpr bool Linearize = bool(Flags & LinearAccessBit);
static constexpr bool DontVectorize =
enum_lt_not_dynamic(Linearize ? MaxSizeAtCompileTime : MaxInnerSizeAtCompileTime, PacketSize);
static constexpr bool Vectorize =
!DontVectorize && bool(Flags & PacketAccessBit) && functor_traits<Func>::PacketAccess;
using Loop = find_coeff_loop<Evaluator, Func, Linearize, Vectorize>;
template <bool ForwardLinearAccess = Linearize, std::enable_if_t<!ForwardLinearAccess, bool> = true>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
Index& inner) {
Evaluator eval(xpr);
Loop::run(eval, func, res, outer, inner);
}
template <bool ForwardLinearAccess = Linearize, std::enable_if_t<ForwardLinearAccess, bool> = true>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
Index& inner) {
// where possible, use the linear loop and back-calculate the outer and inner indices
Index index = 0;
run(xpr, func, res, index);
outer = index / xpr.innerSize();
inner = index % xpr.innerSize();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& index) {
Evaluator eval(xpr);
Loop::run(eval, func, res, index);
}
};
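The linear-access shortcut recovers the two-dimensional location with one division and one modulo against innerSize(). For a column-major matrix, innerSize() is the number of rows, so for example:

#include <cassert>

int main() {
  // 4 rows, column-major: linear index 7 lives in column 7 / 4 == 1, row 7 % 4 == 3.
  long innerSize = 4, index = 7;
  assert(index / innerSize == 1);  // outer (column)
  assert(index % innerSize == 3);  // inner (row)
}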
template <typename Derived, typename IndexType, typename Func>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
IndexType* rowPtr, IndexType* colPtr) {
eigen_assert(mat.rows() > 0 && mat.cols() > 0 && "you are using an empty matrix");
using Scalar = typename DenseBase<Derived>::Scalar;
using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
Index outer = 0;
Index inner = 0;
Scalar res = mat.coeff(0, 0);
FindCoeffImpl::run(mat.derived(), func, res, outer, inner);
*rowPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? outer : inner);
if (colPtr) *colPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? inner : outer);
return res;
}
template <typename Derived, typename IndexType, typename Func>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
IndexType* indexPtr) {
eigen_assert(mat.size() > 0 && "you are using an empty matrix");
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
using Scalar = typename DenseBase<Derived>::Scalar;
using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
Index index = 0;
Scalar res = mat.coeff(0);
FindCoeffImpl::run(mat.derived(), func, res, index);
*indexPtr = internal::convert_index<IndexType>(index);
return res;
}
} // namespace internal
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowPtr,
IndexType* colPtr) const {
using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, rowPtr, colPtr);
}
/** \returns the minimum of all coefficients of *this and puts in *index its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* indexPtr) const {
using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, indexPtr);
}
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
IndexType* colPtr) const {
using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, rowPtr, colPtr);
}
/** \returns the maximum of all coefficients of *this and puts in *index its location.
*
* If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* indexPtr) const {
using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
Func func;
return internal::findCoeff(derived(), func, indexPtr);
}
} // namespace Eigen
#endif // EIGEN_FIND_COEFF_H
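A possible use of the API documented above, assuming an Eigen build that contains this file: with PropagateNumbers the NaN is skipped and the first minimal finite coefficient is reported together with its location.

#include <Eigen/Core>
#include <cmath>
#include <iostream>

int main() {
  Eigen::MatrixXf m(2, 2);
  m << 3.f, 1.f,
       NAN, 4.f;
  Eigen::Index r, c;
  float v = m.minCoeff<Eigen::PropagateNumbers>(&r, &c);  // skips the NaN
  std::cout << v << " at (" << r << ", " << c << ")\n";   // prints: 1 at (0, 1)
}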

View File

@@ -375,7 +375,7 @@ EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) {
return a && b;
}
// In the generic case, memset to all one bits.
// In the generic packet case, memset to all one bits.
template <typename Packet, typename EnableIf = void>
struct ptrue_impl {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
@@ -385,19 +385,16 @@ struct ptrue_impl {
}
};
// Use a value of one for scalars.
template <typename Scalar>
struct ptrue_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar&) { return Scalar(1); }
};
// For booleans, we can only directly set a valid `bool` value to avoid UB.
template <>
struct ptrue_impl<bool, void> {
static EIGEN_DEVICE_FUNC inline bool run(const bool& /*a*/) { return true; }
};
// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value).
// Although this is technically not a valid bitmask, the scalar path for pselect
// uses a comparison to zero, so this should still work in most cases. We don't
// have another option, since the scalar type requires initialization.
template <typename T>
struct ptrue_impl<T, std::enable_if_t<is_scalar<T>::value && NumTraits<T>::RequireInitialization>> {
static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { return T(1); }
static EIGEN_DEVICE_FUNC inline bool run(const bool&) { return true; }
};
/** \internal \returns one bits. */
@@ -406,7 +403,7 @@ EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& a) {
return ptrue_impl<Packet>::run(a);
}
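The new comment explains why ptrue cannot simply memset every scalar type: for a float, the all-one-bits pattern is not a usable truth value. A standalone check, not part of this commit:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  // 0xFFFFFFFF reinterpreted as an IEEE float is a (negative, quiet) NaN,
  // which is why initialization-requiring scalars get Scalar(1) instead.
  uint32_t bits = 0xFFFFFFFFu;
  float f;
  std::memcpy(&f, &bits, sizeof f);
  assert(std::isnan(f));
}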
// In the general case, memset to zero.
// In the general packet case, memset to zero.
template <typename Packet, typename EnableIf = void>
struct pzero_impl {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
@@ -608,7 +605,7 @@ EIGEN_DEVICE_FUNC inline bool pselect<bool>(const bool& cond, const bool& a, con
/** \internal \returns the min or max of \a a and \a b (coeff-wise)
If either \a a or \a b is NaN, the result is implementation defined. */
template <int NaNPropagation>
template <int NaNPropagation, bool IsInteger>
struct pminmax_impl {
template <typename Packet, typename Op>
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
@@ -619,7 +616,7 @@ struct pminmax_impl {
/** \internal \returns the min or max of \a a and \a b (coeff-wise)
If either \a a or \a b is NaN, NaN is returned. */
template <>
struct pminmax_impl<PropagateNaN> {
struct pminmax_impl<PropagateNaN, false> {
template <typename Packet, typename Op>
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
Packet not_nan_mask_a = pcmp_eq(a, a);
@@ -632,7 +629,7 @@ struct pminmax_impl<PropagateNaN> {
If both \a a and \a b are NaN, NaN is returned.
Equivalent to std::fmin(a, b). */
template <>
struct pminmax_impl<PropagateNumbers> {
struct pminmax_impl<PropagateNumbers, false> {
template <typename Packet, typename Op>
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
Packet not_nan_mask_a = pcmp_eq(a, a);
@@ -654,7 +651,8 @@ EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
NaNPropagation determines the NaN propagation semantics. */
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
}
/** \internal \returns the max of \a a and \a b (coeff-wise)
@@ -668,7 +666,8 @@ EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
NaNPropagation determines the NaN propagation semantics. */
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
}
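At scalar granularity the two policies mirror the C standard library: PropagateNumbers behaves like std::fmin/std::fmax, where a lone NaN is ignored, while PropagateNaN returns NaN as soon as either input is NaN. For instance:

#include <cassert>
#include <cmath>

int main() {
  assert(std::fmin(NAN, 2.0) == 2.0);       // PropagateNumbers: prefer the number
  assert(std::isnan(std::fmin(NAN, NAN)));  // both NaN -> NaN
  // PropagateNaN semantics would instead yield NaN for the (NAN, 2.0) case.
}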
/** \internal \returns the absolute value of \a a */
@@ -873,17 +872,29 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_trait
return a;
}
template <typename Packet, typename EnableIf = void>
struct peven_mask_impl {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet&) {
typedef typename unpacket_traits<Packet>::type Scalar;
const size_t n = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
for (size_t i = 0; i < n; ++i) {
memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
}
return ploadu<Packet>(elements);
}
};
template <typename Scalar>
struct peven_mask_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar&) { return Scalar(1); }
};
/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
where x is the value of all 1-bits. */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& /*a*/) {
typedef typename unpacket_traits<Packet>::type Scalar;
const size_t n = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
for (size_t i = 0; i < n; ++i) {
memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
}
return ploadu<Packet>(elements);
EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& a) {
return peven_mask_impl<Packet>::run(a);
}
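For four float lanes the generic loop above produces bit patterns that alternate between all ones and all zeros. A standalone mirror of that loop, not part of this commit:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float elements[4];
  for (int i = 0; i < 4; ++i)  // even lanes: all one bits; odd lanes: zero
    std::memset(elements + i, (i & 1) == 0 ? 0xff : 0, sizeof(float));
  for (int i = 0; i < 4; ++i) {
    uint32_t bits;
    std::memcpy(&bits, elements + i, sizeof bits);
    std::printf("lane %d: 0x%08x\n", i, (unsigned)bits);  // ffffffff, 00000000, ...
  }
}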
/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
@@ -1244,26 +1255,46 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const
template <typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<PropagateFast, Scalar>)));
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<Scalar>)));
}
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
}
/** \internal \returns the min of the elements of \a a */
/** \internal \returns the max of the elements of \a a */
template <typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<PropagateFast, Scalar>)));
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<Scalar>)));
}
template <int NaNPropagation, typename Packet>
struct predux_min_max_helper_impl {
using Scalar = typename unpacket_traits<Packet>::type;
static constexpr bool UsePredux_ = NaNPropagation == PropagateFast || NumTraits<Scalar>::IsInteger;
template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
}
template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
}
template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
return predux_min(a);
}
template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>
static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
return predux_max(a);
}
};
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
return predux_min_max_helper_impl<NaNPropagation, Packet>::run_min(a);
}
template <int NaNPropagation, typename Packet>
EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
return predux_min_max_helper_impl<NaNPropagation, Packet>::run_max(a);
}
#undef EIGEN_BINARY_OP_NAN_PROPAGATION

View File

@@ -182,10 +182,6 @@ struct imag_ref_retval {
typedef typename NumTraits<Scalar>::Real& type;
};
// implementation in MathFunctionsImpl.h
template <typename Mask, bool is_built_in_float = std::is_floating_point<Mask>::value>
struct scalar_select_mask;
} // namespace internal
namespace numext {
@@ -211,9 +207,9 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar&
return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
}
template <typename Scalar, typename Mask>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Mask& mask, const Scalar& a, const Scalar& b) {
return internal::scalar_select_mask<Mask>::run(mask) ? b : a;
template <typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Scalar& mask, const Scalar& a, const Scalar& b) {
return numext::is_exactly_zero(mask) ? b : a;
}
} // namespace numext
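With the scalar_select_mask machinery removed (see the next file's diff), numext::select reduces to a ternary on is_exactly_zero. A stand-in with the same shape, hypothetical name, not part of this commit:

#include <cassert>

template <typename Scalar>
Scalar select_sketch(const Scalar& mask, const Scalar& a, const Scalar& b) {
  return mask == Scalar(0) ? b : a;  // numext::is_exactly_zero(mask) in the source
}

int main() {
  assert(select_sketch(0.0, 1.0, 2.0) == 2.0);  // zero mask picks b
  assert(select_sketch(3.0, 1.0, 2.0) == 1.0);  // non-zero mask picks a
}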

View File

@@ -256,48 +256,6 @@ EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z) {
return ComplexT(numext::log(a), b);
}
// For generic scalars, use ternary select.
template <typename Mask>
struct scalar_select_mask<Mask, /*is_built_in_float*/ false> {
static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) { return numext::is_exactly_zero(mask); }
};
// For built-in float mask, bitcast the mask to its integer counterpart and use ternary select.
template <typename Mask>
struct scalar_select_mask<Mask, /*is_built_in_float*/ true> {
using IntegerType = typename numext::get_integer_by_size<sizeof(Mask)>::unsigned_type;
static EIGEN_DEVICE_FUNC inline bool run(const Mask& mask) {
return numext::is_exactly_zero(numext::bit_cast<IntegerType>(std::abs(mask)));
}
};
template <int Size = sizeof(long double)>
struct ldbl_select_mask {
static constexpr int MantissaDigits = std::numeric_limits<long double>::digits;
static constexpr int NumBytes = (MantissaDigits == 64 ? 80 : 128) / CHAR_BIT;
static EIGEN_DEVICE_FUNC inline bool run(const long double& mask) {
const uint8_t* mask_bytes = reinterpret_cast<const uint8_t*>(&mask);
for (Index i = 0; i < NumBytes; i++) {
if (mask_bytes[i] != 0) return false;
}
return true;
}
};
template <>
struct ldbl_select_mask<sizeof(double)> : scalar_select_mask<double> {};
template <>
struct scalar_select_mask<long double, true> : ldbl_select_mask<> {};
template <typename RealMask>
struct scalar_select_mask<std::complex<RealMask>, false> {
using impl = scalar_select_mask<RealMask>;
static EIGEN_DEVICE_FUNC inline bool run(const std::complex<RealMask>& mask) {
return impl::run(numext::real(mask)) && impl::run(numext::imag(mask));
}
};
} // end namespace internal
} // end namespace Eigen

View File

@@ -851,7 +851,7 @@ struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
template <typename Dest>
static EIGEN_DEVICE_FUNC void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::IsVectorAtCompileTime>::run(
selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::ColsAtCompileTime == 1>::run(
dst, lhs.nestedExpression(), rhs, alpha);
}
};
@@ -863,7 +863,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
template <typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
selfadjoint_product_impl<Lhs, 0, Lhs::IsVectorAtCompileTime, typename Rhs::MatrixType, Rhs::Mode, false>::run(
selfadjoint_product_impl<Lhs, 0, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, Rhs::Mode, false>::run(
dst, lhs, rhs.nestedExpression(), alpha);
}
};
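A plausible reading of the two replacements above: IsVectorAtCompileTime is true for 1xN and Nx1 expressions alike, while the selfadjoint kernel needs to know specifically whether the dense operand is a single column (respectively a single row). The distinction, checked at compile time assuming Eigen:

#include <Eigen/Core>

int main() {
  static_assert(Eigen::RowVector3f::IsVectorAtCompileTime, "1x3 is a vector");
  static_assert(Eigen::RowVector3f::ColsAtCompileTime == 3, "...but not a single column");
  static_assert(Eigen::Vector3f::ColsAtCompileTime == 1, "3x1 is a single column");
}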

View File

@@ -78,6 +78,14 @@ class SolverBase : public EigenBase<Derived> {
template <typename Derived_>
friend struct internal::solve_assertion;
ComputationInfo info() const {
// CRTP static dispatch: Calls the 'info()' method on the derived class.
// Derived must implement 'ComputationInfo info() const'.
// If not implemented, name lookup falls back to this base method, causing
// infinite recursion (detectable by -Winfinite-recursion).
return derived().info();
}
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
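A minimal self-contained mirror of the CRTP dispatch described in the new comment, hypothetical types, not part of this commit: the base casts itself to Derived and calls info(); a Derived that forgets to define info() resolves the call back to this base method and recurses forever.

enum Info { Success };

template <typename Derived>
struct SolverBaseSketch {
  Info info() const { return static_cast<const Derived&>(*this).info(); }  // CRTP hop
};

struct GoodSolver : SolverBaseSketch<GoodSolver> {
  Info info() const { return Success; }  // hides the base fallback, ending the hop
};

int main() { return GoodSolver().info() == Success ? 0 : 1; }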

View File

@@ -603,10 +603,9 @@ class VectorwiseOp {
/** Returns the expression where each subvector is the product of the vector \a other
* by the corresponding subvector of \c *this */
template <typename OtherDerived>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_product_op<Scalar>, const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type> EIGEN_DEVICE_FUNC
operator*(const DenseBase<OtherDerived>& other) const {
EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_product_op<Scalar, typename OtherDerived::Scalar>,
const ExpressionTypeNestedCleaned, const typename ExtendedType<OtherDerived>::Type>
operator*(const DenseBase<OtherDerived>& other) const {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
@@ -616,8 +615,8 @@ class VectorwiseOp {
/** Returns the expression where each subvector is the quotient of the corresponding
* subvector of \c *this by the vector \a other */
template <typename OtherDerived>
EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type>
EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>,
const ExpressionTypeNestedCleaned, const typename ExtendedType<OtherDerived>::Type>
operator/(const DenseBase<OtherDerived>& other) const {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)

View File

@@ -384,173 +384,6 @@ EIGEN_DEVICE_FUNC void DenseBase<Derived>::visit(Visitor& visitor) const {
namespace internal {
/** \internal
* \brief Base class to implement min and max visitors
*/
template <typename Derived>
struct coeff_visitor {
// default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
EIGEN_DEVICE_FUNC coeff_visitor() : row(-1), col(-1), res(0) {}
typedef typename Derived::Scalar Scalar;
Index row, col;
Scalar res;
EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index i, Index j) {
res = value;
row = i;
col = j;
}
};
template <typename Scalar, int NaNPropagation, bool is_min = true>
struct minmax_compare {
typedef typename packet_traits<Scalar>::type Packet;
static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a < b; }
static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_min<NaNPropagation>(p); }
};
template <typename Scalar, int NaNPropagation>
struct minmax_compare<Scalar, NaNPropagation, false> {
typedef typename packet_traits<Scalar>::type Packet;
static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a > b; }
static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_max<NaNPropagation>(p); }
};
// Default implementation used by non-floating types, where we do not
// need special logic for NaN handling.
template <typename Derived, bool is_min, int NaNPropagation,
bool isInt = NumTraits<typename Derived::Scalar>::IsInteger>
struct minmax_coeff_visitor : coeff_visitor<Derived> {
using Scalar = typename Derived::Scalar;
using Packet = typename packet_traits<Scalar>::type;
using Comparator = minmax_compare<Scalar, NaNPropagation, is_min>;
static constexpr Index PacketSize = packet_traits<Scalar>::size;
EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
if (Comparator::compare(value, this->res)) {
this->res = value;
this->row = i;
this->col = j;
}
}
EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
Scalar value = Comparator::predux(p);
if (Comparator::compare(value, this->res)) {
const Packet range = preverse(plset<Packet>(Scalar(1)));
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
Scalar value = Comparator::predux(p);
const Packet range = preverse(plset<Packet>(Scalar(1)));
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
};
// Suppress NaN. The only case in which we return NaN is if the matrix is all NaN,
// in which case, row=0, col=0 is returned for the location.
template <typename Derived, bool is_min>
struct minmax_coeff_visitor<Derived, is_min, PropagateNumbers, false> : coeff_visitor<Derived> {
typedef typename Derived::Scalar Scalar;
using Packet = typename packet_traits<Scalar>::type;
using Comparator = minmax_compare<Scalar, PropagateNumbers, is_min>;
EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
this->res = value;
this->row = i;
this->col = j;
}
}
EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
const Packet range = preverse(plset<Packet>(Scalar(1)));
/* mask will be zero for NaNs, so they will be ignored. */
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
if ((numext::isnan)(value)) {
this->res = value;
this->row = 0;
this->col = 0;
return;
}
const Packet range = preverse(plset<Packet>(Scalar(1)));
/* mask will be zero for NaNs, so they will be ignored. */
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
};
// Propagate NaNs. If the matrix contains NaN, the location of the first NaN
// will be returned in row and col.
template <typename Derived, bool is_min, int NaNPropagation>
struct minmax_coeff_visitor<Derived, is_min, NaNPropagation, false> : coeff_visitor<Derived> {
typedef typename Derived::Scalar Scalar;
using Packet = typename packet_traits<Scalar>::type;
using Comparator = minmax_compare<Scalar, PropagateNaN, is_min>;
EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
const bool value_is_nan = (numext::isnan)(value);
if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
this->res = value;
this->row = i;
this->col = j;
}
}
EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
const bool value_is_nan = (numext::isnan)(value);
if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
const Packet range = preverse(plset<Packet>(Scalar(1)));
// If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value.
Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
const Index PacketSize = packet_traits<Scalar>::size;
Scalar value = Comparator::predux(p);
const bool value_is_nan = (numext::isnan)(value);
const Packet range = preverse(plset<Packet>(Scalar(1)));
// If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value.
Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
};
template <typename Derived, bool is_min, int NaNPropagation>
struct functor_traits<minmax_coeff_visitor<Derived, is_min, NaNPropagation>> {
using Scalar = typename Derived::Scalar;
enum { Cost = NumTraits<Scalar>::AddCost, LinearAccess = false, PacketAccess = packet_traits<Scalar>::HasCmp };
};
template <typename Scalar>
struct all_visitor {
using result_type = bool;
@@ -643,100 +476,6 @@ struct all_finite_impl<Derived, false> {
} // end namespace internal
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowId,
IndexType* colId) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
internal::minmax_coeff_visitor<Derived, true, NaNPropagation> minVisitor;
this->visit(minVisitor);
*rowId = minVisitor.row;
if (colId) *colId = minVisitor.col;
return minVisitor.res;
}
/** \returns the minimum of all coefficients of *this and puts in *index its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::minCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* index) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::minmax_coeff_visitor<Derived, true, NaNPropagation> minVisitor;
this->visit(minVisitor);
*index = IndexType((RowsAtCompileTime == 1) ? minVisitor.col : minVisitor.row);
return minVisitor.res;
}
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
IndexType* colPtr) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
internal::minmax_coeff_visitor<Derived, false, NaNPropagation> maxVisitor;
this->visit(maxVisitor);
*rowPtr = maxVisitor.row;
if (colPtr) *colPtr = maxVisitor.col;
return maxVisitor.res;
}
/** \returns the maximum of all coefficients of *this and puts in *index its location.
*
* In case \c *this contains NaN, NaNPropagation determines the behavior:
* NaNPropagation == PropagateFast : undefined
* NaNPropagation == PropagateNaN : result is NaN
* NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
* \warning the matrix must not be empty, otherwise an assertion is triggered.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(),
* DenseBase::maxCoeff()
*/
template <typename Derived>
template <int NaNPropagation, typename IndexType>
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* index) const {
eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::minmax_coeff_visitor<Derived, false, NaNPropagation> maxVisitor;
this->visit(maxVisitor);
*index = (RowsAtCompileTime == 1) ? maxVisitor.col : maxVisitor.row;
return maxVisitor.res;
}
/** \returns true if all coefficients are true
*
* Example: \include MatrixBase_all.cpp

View File

@@ -654,25 +654,6 @@ template <>
EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet4l>(const Packet4l& a) {
__m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
return _mm_extract_epi64_0(r) + _mm_extract_epi64_1(r);
}
template <>
EIGEN_STRONG_INLINE uint64_t predux<Packet4ul>(const Packet4ul& a) {
__m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
return numext::bit_cast<uint64_t>(_mm_extract_epi64_0(r) + _mm_extract_epi64_1(r));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0;
}
#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
@@ -1955,23 +1936,6 @@ EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Pack
return pmul(a, c); // a * 2^e
}
template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) {
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
}
template <>
EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a) {
return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
template <>
EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
@@ -1985,82 +1949,6 @@ EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a)
return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) {
Packet8f tmp;
tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) {
Packet4d tmp;
tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) {
Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) {
Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
// not needed yet
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
// {
// return _mm256_movemask_ps(x)==0xFF;
// }
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) {
return _mm256_movemask_ps(x) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4d& x) {
return _mm256_movemask_pd(x) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) {
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) {
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8h& x) {
return _mm_movemask_epi8(x) != 0;
}
#endif // EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& x) {
return _mm_movemask_epi8(x) != 0;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
@@ -2361,24 +2249,64 @@ EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
return float2half(ptrunc<Packet8f>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8h pisinf<Packet8h>(const Packet8h& a) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
return _mm_cmpeq_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
}
template <>
EIGEN_STRONG_INLINE Packet8h pisnan<Packet8h>(const Packet8h& a) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
return _mm_cmpgt_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
}
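Both predicates lean on the IEEE half layout (1 sign, 5 exponent, 10 mantissa bits): +inf is 0x7C00, so any absolute value strictly above it must have the maximal exponent with a non-zero mantissa, i.e. be a NaN. A standalone check of that bound, not part of this commit:

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;  // 0x7C00
  constexpr uint16_t kAbsMask = (1 << 15) - 1;     // 0x7FFF
  uint16_t quiet_nan = 0x7E00, negative_nan = 0xFE00, one = 0x3C00;
  assert((quiet_nan & kAbsMask) > kInf);
  assert((negative_nan & kAbsMask) > kInf);  // sign bit is masked away
  assert((one & kAbsMask) <= kInf);
}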
// convert the sign-magnitude representation to two's complement
EIGEN_STRONG_INLINE __m128i pmaptosigned(const __m128i& a) {
constexpr uint16_t kAbsMask = (1 << 15) - 1;
// if 'a' has the sign bit set, clear the sign bit and negate the result as if it were an integer
return _mm_sign_epi16(_mm_and_si128(a, _mm_set1_epi16(kAbsMask)), a);
}
// returns a mask that is true where neither `a` nor `b` is NaN
EIGEN_STRONG_INLINE Packet8h pisordered(const Packet8h& a, const Packet8h& b) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
__m128i abs_a = _mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask));
__m128i abs_b = _mm_and_si128(b.m_val, _mm_set1_epi16(kAbsMask));
// check if both `abs_a <= kInf` and `abs_b <= kInf` by checking if max(abs_a, abs_b) <= kInf
// SSE has no `less than or equal` comparison for integers, but comparing against kInf + 1 accomplishes the same goal
return _mm_cmplt_epi16(_mm_max_epu16(abs_a, abs_b), _mm_set1_epi16(kInf + 1));
}
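// Added commentary: kInf is 0x7C00 (the absolute bits of +/-infinity) and every NaN has absolute
// bits strictly greater than 0x7C00, so max(abs_a, abs_b) < 0x7C01 holds exactly when neither
// input is NaN; infinities remain ordered.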
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
__m128i isOrdered = pisordered(a, b);
__m128i isEqual = _mm_cmpeq_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_and_si128(isOrdered, isEqual);
}
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_le(half2float(a), half2float(b)));
__m128i isOrdered = pisordered(a, b);
__m128i isGreater = _mm_cmpgt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_andnot_si128(isGreater, isOrdered);
}
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
__m128i isOrdered = pisordered(a, b);
__m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_and_si128(isOrdered, isLess);
}
template <>
EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
__m128i isUnordered = por(pisnan(a), pisnan(b));
__m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
return _mm_or_si128(isUnordered, isLess);
}
template <>
@ -2473,34 +2401,6 @@ EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const
to[stride * 7] = aux[7];
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_max<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_min<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_mul<Packet8f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
@ -2859,26 +2759,6 @@ EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packe
to[stride * 7] = aux[7];
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

View File

@ -0,0 +1,353 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REDUCTIONS_AVX_H
#define EIGEN_REDUCTIONS_AVX_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
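// Added commentary: every 256-bit reduction in this file uses the same halving strategy: split the
// register into its two 128-bit lanes, combine the lanes with the reduction's own operation, and
// delegate to the matching SSE reduction, e.g. predux(a) == predux(padd(lo, hi)).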
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8i -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int predux(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE int predux_mul(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE int predux_min(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE int predux_max(const Packet8i& a) {
Packet4i lo = _mm256_castsi256_si128(a);
Packet4i hi = _mm256_extractf128_si256(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_movemask_epi8(a) != 0x0;
#else
return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
#endif
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8ui -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE uint32_t predux(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_min(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_max(const Packet8ui& a) {
Packet4ui lo = _mm256_castsi256_si128(a);
Packet4ui hi = _mm256_extractf128_si256(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_movemask_epi8(a) != 0x0;
#else
return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
#endif
}
#ifdef EIGEN_VECTORIZE_AVX2
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4l -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) {
Packet2l lo = _mm256_castsi256_si128(a);
Packet2l hi = _mm256_extractf128_si256(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ul -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE uint64_t predux(const Packet4ul& a) {
return static_cast<uint64_t>(predux(Packet4l(a)));
}
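// Added commentary: the unsigned sum above can reuse the signed Packet4l reduction because
// two's-complement addition is bitwise identical for signed and unsigned lanes (mod 2^64).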
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
}
#endif
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8f -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE float predux(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet8f& a) {
Packet4f lo = _mm256_castps256_ps128(a);
Packet4f hi = _mm256_extractf128_ps(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
return _mm256_movemask_ps(a) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4d -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE double predux(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux(padd(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_mul(pmul(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_min(pmin(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_max(pmax(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet4d& a) {
Packet2d lo = _mm256_castpd256_pd128(a);
Packet2d hi = _mm256_extractf128_pd(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4d& a) {
return _mm256_movemask_pd(a) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8h -- -- -- -- -- -- -- -- -- -- -- -- */
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE half predux(const Packet8h& a) {
return static_cast<half>(predux(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_mul(const Packet8h& a) {
return static_cast<half>(predux_mul(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_min(const Packet8h& a) {
return static_cast<half>(predux_min(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet8h& a) {
return static_cast<half>(predux_min<PropagateNumbers>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet8h& a) {
return static_cast<half>(predux_min<PropagateNaN>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_max(const Packet8h& a) {
return static_cast<half>(predux_max(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet8h& a) {
return static_cast<half>(predux_max<PropagateNumbers>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet8h& a) {
return static_cast<half>(predux_max<PropagateNaN>(half2float(a)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8h& a) {
return _mm_movemask_epi8(a) != 0;
}
#endif // EIGEN_VECTORIZE_AVX512FP16
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8bf -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) {
return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) {
return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(a)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& a) {
return _mm_movemask_epi8(a) != 0;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REDUCTIONS_AVX_H

View File

@ -1494,40 +1494,6 @@ EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d&
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
#endif
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f x = _mm256_add_ps(lane0, lane1);
return predux<Packet8f>(x);
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
return predux<Packet4f>(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d sum = _mm256_add_pd(lane0, lane1);
return predux<Packet4d>(sum);
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet8l>(const Packet8l& a) {
return _mm512_reduce_add_epi64(a);
}
template <>
EIGEN_STRONG_INLINE int predux<Packet16i>(const Packet16i& a) {
return _mm512_reduce_add_epi32(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
@ -1574,136 +1540,6 @@ EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
return _mm256_add_epi64(lane0, lane1);
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
// #ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE int predux_mul<Packet16i>(const Packet16i& a) {
return _mm512_reduce_mul_epi32(a);
}
#if EIGEN_COMP_MSVC
// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative.
// Fall back to a manual approach:
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
Packet4l res = pmul(lane0, lane1);
res = pmul(res, Packet4l(_mm256_permute2x128_si256(res, res, 1)));
res = pmul(res, Packet4l(_mm256_shuffle_epi32(res, 0xE)));
return pfirst(res);
}
#else
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
return _mm512_reduce_mul_epi64(a);
}
#endif
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE int predux_min<Packet16i>(const Packet16i& a) {
return _mm512_reduce_min_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int64_t predux_min<Packet8l>(const Packet8l& a) {
return _mm512_reduce_min_epi64(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE int predux_max<Packet16i>(const Packet16i& a) {
return _mm512_reduce_max_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int64_t predux_max<Packet8l>(const Packet8l& a) {
return _mm512_reduce_max_epi64(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) {
return _mm512_reduce_or_epi32(a) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) {
return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
return _mm512_reduce_or_epi64(a) != 0;
}
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
@ -2466,12 +2302,6 @@ EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet
return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
}
template <>
EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
}
template <>
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
Packet8h lane0 = _mm256_extractf128_si256(a, 0);
@ -2479,26 +2309,6 @@ EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
return padd<Packet8h>(lane0, lane1);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
Packet16f af = half2float(a);
float reduced = predux_max<Packet16f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
Packet16f af = half2float(a);
float reduced = predux_min<Packet16f>(af);
return Eigen::half(reduced);
}
template <>
EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux_mul(from_float));
}
template <>
EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
@ -3005,26 +2815,6 @@ EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a
return padd<Packet8bf>(lane0, lane1);
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
__m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,

View File

@ -0,0 +1,297 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REDUCTIONS_AVX512_H
#define EIGEN_REDUCTIONS_AVX512_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16i -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int predux(const Packet16i& a) {
return _mm512_reduce_add_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int predux_mul(const Packet16i& a) {
return _mm512_reduce_mul_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int predux_min(const Packet16i& a) {
return _mm512_reduce_min_epi32(a);
}
template <>
EIGEN_STRONG_INLINE int predux_max(const Packet16i& a) {
return _mm512_reduce_max_epi32(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) {
return _mm512_reduce_or_epi32(a) != 0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8l -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) {
return _mm512_reduce_add_epi64(a);
}
#if EIGEN_COMP_MSVC
// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative.
// Fall back to a manual approach:
template <>
EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
return predux_mul(pmul(lane0, lane1));
}
#else
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
return _mm512_reduce_mul_epi64(a);
}
#endif
template <>
EIGEN_STRONG_INLINE int64_t predux_min(const Packet8l& a) {
return _mm512_reduce_min_epi64(a);
}
template <>
EIGEN_STRONG_INLINE int64_t predux_max(const Packet8l& a) {
return _mm512_reduce_max_epi64(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
return _mm512_reduce_or_epi64(a) != 0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16f -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE float predux(const Packet16f& a) {
return _mm512_reduce_add_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) {
return _mm512_reduce_mul_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
return _mm512_reduce_min_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
return _mm512_reduce_max_ps(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet16f& a) {
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0;
}
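// Added commentary: AVX512 exposes native _mm512_reduce_{add,mul,min,max}_ps sequences for the plain
// reductions above, but no NaN-aware variants, which is why the PropagateNumbers/PropagateNaN
// specializations split into two 256-bit lanes and reuse the AVX kernels instead.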
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8d -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE double predux(const Packet8d& a) {
return _mm512_reduce_add_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) {
return _mm512_reduce_mul_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) {
return _mm512_reduce_min_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) {
return _mm512_reduce_max_pd(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) {
return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0;
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16h -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE half predux(const Packet16h& from) {
return half(predux(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) {
return half(predux_mul(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) {
return half(predux_min(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet16h& from) {
return half(predux_min<PropagateNumbers>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet16h& from) {
return half(predux_min<PropagateNaN>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) {
return half(predux_max(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet16h& from) {
return half(predux_max<PropagateNumbers>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet16h& from) {
return half(predux_max<PropagateNaN>(half2float(from)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16h& a) {
return predux_any<Packet8i>(a.m_val);
}
#endif
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16bf -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& from) {
return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) {
return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet16bf& from) {
return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(from)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16bf& a) {
return predux_any<Packet8i>(a.m_val);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REDUCTIONS_AVX512_H

View File

@ -129,30 +129,20 @@ EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a)
}
#ifdef EIGEN_VECTORIZE_VSX
// VSX support varies between different compilers and even different
// versions of the same compiler. For gcc version >= 4.9.3, we can use
// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
// a slow version that works with older compilers.
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
template <>
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
#if EIGEN_GNUC_STRICT_AT_LEAST(7, 1, 0)
return vec_cts(x, 0); // TODO: check clang version.
#else
double tmp[2];
memcpy(tmp, &x, sizeof(tmp));
Packet2l l = {static_cast<long long>(tmp[0]), static_cast<long long>(tmp[1])};
return l;
#endif
EIGEN_ALIGN_MAX double dtmp[2];
pstore(dtmp, x);
EIGEN_ALIGN_MAX long long itmp[2] = {static_cast<long long>(dtmp[0]), static_cast<long long>(dtmp[1])};
return vec_xl(0, itmp);
}
template <>
inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
unsigned long long tmp[2];
memcpy(tmp, &x, sizeof(tmp));
Packet2d d = {static_cast<double>(tmp[0]), static_cast<double>(tmp[1])};
return d;
EIGEN_ALIGN_MAX long long itmp[2];
vec_xst(x, 0, itmp);
EIGEN_ALIGN_MAX double dtmp[2] = {static_cast<double>(itmp[0]), static_cast<double>(itmp[1])};
return pload<Packet2d>(dtmp);
}
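// Added commentary: routing the conversion through aligned scalar buffers (pstore/pload and
// vec_xl/vec_xst) sidesteps the buggy 64-bit vec_cts/vec_ctf path referenced above, at the cost
// of one store/load round trip per conversion.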
#endif

View File

@ -1689,7 +1689,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const
}
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
using Scalar = typename unpacket_traits<Packet>::type;
@ -1705,7 +1706,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
};
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
@ -1724,7 +1726,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
};
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
@ -1739,7 +1742,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
// \internal \returns the sign of a complex number z, defined as z / abs(z).
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
unpacket_traits<Packet>::vectorizable>> {
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
@ -2176,7 +2180,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, c
// Generic implementation of pow(x,y).
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) {
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
const Packet& x, const Packet& y) {
typedef typename unpacket_traits<Packet>::type Scalar;
const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
@ -2266,6 +2271,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Pac
return pow;
}
template <typename Scalar>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
const Scalar& x, const Scalar& y) {
return numext::pow(x, y);
}
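// Added commentary: the two generic_pow overloads dispatch on is_scalar, so the same call site
// compiles for packets (vectorized implementation above) and for plain scalars, which simply
// forward to numext::pow.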
namespace unary_pow {
template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
@ -2347,35 +2358,36 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const Scal
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x,
const typename unpacket_traits<Packet>::type& exponent) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
const Packet exponent_packet = pset1<Packet>(exponent);
return generic_pow_impl(x, exponent_packet);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
const Scalar& x, const Scalar& exponent) {
return numext::pow(x, exponent);
}
template <typename Packet, typename ScalarExponent>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
const ScalarExponent& exponent) {
using Scalar = typename unpacket_traits<Packet>::type;
// non-integer base and exponent case
const Scalar pos_zero = Scalar(0);
const Scalar all_ones = ptrue<Scalar>(Scalar());
const Scalar pos_one = Scalar(1);
const Scalar pos_inf = NumTraits<Scalar>::infinity();
const Packet cst_pos_zero = pzero(x);
const Packet cst_pos_one = pset1<Packet>(pos_one);
const Packet cst_pos_inf = pset1<Packet>(pos_inf);
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
const Packet cst_true = ptrue<Packet>(x);
const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
const bool exponent_is_neg = exponent < ScalarExponent(0);
const bool exponent_is_pos = exponent > ScalarExponent(0);
const Packet exp_is_not_fin = pset1<Packet>(exponent_is_not_fin ? all_ones : pos_zero);
const Packet exp_is_neg = pset1<Packet>(exponent_is_neg ? all_ones : pos_zero);
const Packet exp_is_pos = pset1<Packet>(exponent_is_pos ? all_ones : pos_zero);
const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
@ -2411,22 +2423,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Pack
// This routine handles negative exponents.
// The return value is either 0, 1, or -1.
const Scalar pos_zero = Scalar(0);
const Scalar all_ones = ptrue<Scalar>(Scalar());
const Scalar pos_one = Scalar(1);
const Packet cst_pos_one = pset1<Packet>(pos_one);
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
const Packet exp_is_odd = pset1<Packet>(exponent_is_odd ? all_ones : pos_zero);
const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
const Packet abs_x = pabs(x);
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
Packet result = pselect(exp_is_odd, x, abs_x);
result = pand(abs_x_is_one, result);
result = pselect(abs_x_is_one, result, pzero<Packet>(x));
return result;
}
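// Added commentary, tracing the mask logic above: when |x| == 1 the result is x for an odd exponent
// (e.g. pow(-1, -3) == -1) and |x| for an even one; for any |x| != 1 the abs_x_is_one mask is
// false and the result collapses to 0, with other magnitudes resolved by the surrounding pow logic.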

View File

@ -497,16 +497,56 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
a = half(float(a) / float(b));
return a;
}
// Non-negative floating point numbers have a monotonic mapping to non-negative integers.
// This property allows floating point numbers to be reinterpreted as integers for comparisons, which is useful if
// there is no native floating point comparison operator. Floating point signedness uses a sign-magnitude
// representation, whereas integers typically use two's complement. Converting the bit pattern from sign-magnitude to
// two's complement allows the transformed bit patterns to be compared as signed integers. All edge cases (+/-0 and
// +/- infinity) are handled automatically, except NaN.
//
// fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. The bit pattern conveys NaN when all five exponent
// bits are set and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
// NaN, clear the sign bit and check if the integral representation is greater than 0111110000000000 (0x7C00). To
// test for non-NaN, clear the sign bit and check if the integral representation is less than or equal to
// 0111110000000000.
// convert sign-magnitude representation to two's complement
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
constexpr uint16_t kAbsMask = (1 << 15) - 1;
// If the sign bit is set, clear the sign bit and return the (integer) negation. Otherwise, return the input.
return (a >> 15) ? -(a & kAbsMask) : a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool isOrdered(const half& a, const half& b) {
constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
constexpr uint16_t kAbsMask = (1 << 15) - 1;
return numext::maxi(a.x & kAbsMask, b.x & kAbsMask) <= kInf;
}
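// Worked example (added commentary, not part of the original change):
//   half a(1.0f);   // a.x == 0x3C00, mapToSigned -> +15360
//   half b(-2.0f);  // b.x == 0xC000, mapToSigned -> -16384
//   bool lt = (b < a);  // true: -16384 < +15360 and both operands are ordered
// A NaN payload such as 0x7E00 fails isOrdered(), so every ordered comparison below involving it
// returns false, while operator!= (the negation of ==) returns true, matching IEEE semantics.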
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
return numext::equal_strict(float(a), float(b));
bool result = mapToSigned(a.x) == mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) {
return numext::not_equal_strict(float(a), float(b));
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !(a == b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) {
bool result = mapToSigned(a.x) < mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) {
bool result = mapToSigned(a.x) <= mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) {
bool result = mapToSigned(a.x) > mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) {
bool result = mapToSigned(a.x) >= mapToSigned(b.x);
result &= isOrdered(a, b);
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return float(a) < float(b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return float(a) <= float(b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return float(a) > float(b); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return float(a) >= float(b); }
#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
#pragma pop_macro("EIGEN_DEVICE_FUNC")
@ -706,7 +746,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) < 0x7c00;
#else
return (a.x & 0x7fff) < 0x7c00;
#endif
}
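// Added commentary: 0x7C00 is the bit pattern of +infinity in fp16, so clearing the sign bit and
// requiring the result to be strictly below 0x7C00 classifies every finite value, including
// subnormals and +/-0, in a single integer comparison.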
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {

View File

@ -31,6 +31,15 @@ namespace internal {
#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
#endif
// We need to distinguish clang as the CUDA compiler from clang as the host compiler,
// invoked by NVCC (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
#else
#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
#endif
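// Added commentary: this macro gates the __int_as_float-style device helpers defined further down
// in this file and feeds packet_traits entries such as HasCmp below.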
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
@ -74,7 +83,10 @@ struct packet_traits<float> : default_packet_traits {
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
HasBlend = 0
HasBlend = 0,
HasFloor = 1,
HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
};
};
@ -143,10 +155,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from)
return make_double2(from, from);
}
// We need to distinguish clang as the CUDA compiler from clang as the host compiler,
// invoked by NVCC (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
return __int_as_float(__float_as_int(a) & __float_as_int(b));
@ -259,8 +268,7 @@ template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
}
#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG &&
// !EIGEN_COMP_NVCC)
#endif // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {

View File

@ -1287,6 +1287,14 @@ template <>
EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vfma_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return vfmsq_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vfms_f32(c, a, b);
}
#else
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
@ -1296,7 +1304,31 @@ template <>
EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vmla_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return vmlsq_f32(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return vmls_f32(c, a, b);
}
#endif
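// Added commentary: the remaining FMA variants follow from sign identities,
//   pmsub(a, b, c)  = a*b - c  = -pnmadd(a, b, c)
//   pnmsub(a, b, c) = -a*b - c = -pmadd(a, b, c)
// so both reuse the fused kernels above (pnegate is exact, so each lane still rounds once).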
template <>
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
return pnegate(pmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
return pnegate(pmadd(a, b, c));
}
// No FMA instruction for int, so use MLA unconditionally.
template <>
@ -5242,13 +5274,28 @@ template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vfmaq_f64(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vfmsq_f64(c, a, b);
}
#else
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vmlaq_f64(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return vmlsq_f64(c, a, b);
}
#endif
template <>
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
return pnegate(pmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
return vminq_f64(a, b);
@ -5657,18 +5704,33 @@ EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, cons
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return vfmaq_f16(pnegate(c), a, b);
EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return vfmsq_f16(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
return vfma_f16(c, pnegate(a), b);
return vfms_f16(c, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
return pnegate(pnmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
return pnegate(pmadd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
return vfma_f16(pnegate(c), pnegate(a), b);
return pnegate(pmadd(a, b, c));
}
template <>

View File

@ -1857,220 +1857,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
template <>
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
// Disable SSE3 _mm_hadd_ps that is extremely slow on all existing Intel architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
// #else
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
// #endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
// Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
// #else
return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a, a)));
// #endif
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
return pfirst<Packet2l>(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a)));
}
#ifdef EIGEN_VECTORIZE_SSSE3
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
Packet4i tmp0 = _mm_hadd_epi32(a, a);
return pfirst<Packet4i>(_mm_hadd_epi32(tmp0, tmp0));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp0 = _mm_hadd_epi32(a, a);
return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
}
#else
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
}
#endif
template <>
EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
}
// Other reduction functions:
// mul
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
}
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
EIGEN_ALIGN16 int64_t aux[2];
pstore(aux, a);
return aux[0] * aux[1];
}
template <>
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
template <>
EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
}
// min
template <>
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a, a)));
}
template <>
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
int aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
int aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
return aux0 < aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4ui>(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
return aux0 < aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
// max
template <>
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a));
return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a, a)));
}
template <>
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4i>(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::max after the pstore!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
int aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
int aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
return aux0 > aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst<Packet4ui>(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::max after the pstore!)
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
return aux0 > aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
// not needed yet
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
// {
// return _mm_movemask_ps(x) == 0xF;
// }
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) {
return _mm_movemask_pd(x) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
return _mm_movemask_ps(x) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) {
return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

View File

@ -0,0 +1,324 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REDUCTIONS_SSE_H
#define EIGEN_REDUCTIONS_SSE_H
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
template <typename Packet>
struct sse_add_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return padd<Packet>(a, b); }
};
template <typename Packet>
struct sse_mul_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmul<Packet>(a, b); }
};
template <typename Packet>
struct sse_min_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmin<Packet>(a, b); }
};
template <int NaNPropagation, typename Packet>
struct sse_min_prop_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
return pmin<NaNPropagation, Packet>(a, b);
}
};
template <typename Packet>
struct sse_max_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmax<Packet>(a, b); }
};
template <int NaNPropagation, typename Packet>
struct sse_max_prop_wrapper {
static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
return pmax<NaNPropagation, Packet>(a, b);
}
};
template <typename Packet, typename Op>
struct sse_predux_common;
template <typename Packet>
struct sse_predux_impl : sse_predux_common<Packet, sse_add_wrapper<Packet>> {};
template <typename Packet>
struct sse_predux_mul_impl : sse_predux_common<Packet, sse_mul_wrapper<Packet>> {};
template <typename Packet>
struct sse_predux_min_impl : sse_predux_common<Packet, sse_min_wrapper<Packet>> {};
template <int NaNPropagation, typename Packet>
struct sse_predux_min_prop_impl : sse_predux_common<Packet, sse_min_prop_wrapper<NaNPropagation, Packet>> {};
template <typename Packet>
struct sse_predux_max_impl : sse_predux_common<Packet, sse_max_wrapper<Packet>> {};
template <int NaNPropagation, typename Packet>
struct sse_predux_max_prop_impl : sse_predux_common<Packet, sse_max_prop_wrapper<NaNPropagation, Packet>> {};
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16b -- -- -- -- -- -- -- -- -- -- -- -- */
template <>
EIGEN_STRONG_INLINE bool predux(const Packet16b& a) {
Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
}
template <>
EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) {
Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
}
template <>
EIGEN_STRONG_INLINE bool predux_min(const Packet16b& a) {
return predux_mul(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_max(const Packet16b& a) {
return predux(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16b& a) {
return predux(a);
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4i -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet4i, Op> {
static EIGEN_STRONG_INLINE int run(const Packet4i& a) {
Packet4i tmp;
tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp));
return _mm_cvtsi128_si32(tmp);
}
};
template <>
EIGEN_STRONG_INLINE int predux(const Packet4i& a) {
return sse_predux_impl<Packet4i>::run(a);
}
template <>
EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) {
return sse_predux_mul_impl<Packet4i>::run(a);
}
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) {
return sse_predux_min_impl<Packet4i>::run(a);
}
template <>
EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) {
return sse_predux_max_impl<Packet4i>::run(a);
}
#endif
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& a) {
return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ui -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet4ui, Op> {
static EIGEN_STRONG_INLINE uint32_t run(const Packet4ui& a) {
Packet4ui tmp;
tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp));
return static_cast<uint32_t>(_mm_cvtsi128_si32(tmp));
}
};
template <>
EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) {
return sse_predux_impl<Packet4ui>::run(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) {
return sse_predux_mul_impl<Packet4ui>::run(a);
}
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) {
return sse_predux_min_impl<Packet4ui>::run(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) {
return sse_predux_max_impl<Packet4ui>::run(a);
}
#endif
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& a) {
return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2l -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet2l, Op> {
static EIGEN_STRONG_INLINE int64_t run(const Packet2l& a) {
Packet2l tmp;
tmp = Op::packetOp(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp);
}
};
template <>
EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) {
return sse_predux_impl<Packet2l>::run(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2l& a) {
return _mm_movemask_pd(_mm_castsi128_pd(a)) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4f -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet4f, Op> {
static EIGEN_STRONG_INLINE float run(const Packet4f& a) {
Packet4f tmp;
tmp = Op::packetOp(a, _mm_movehl_ps(a, a));
#ifdef EIGEN_VECTORIZE_SSE3
tmp = Op::packetOp(tmp, _mm_movehdup_ps(tmp));
#else
tmp = Op::packetOp(tmp, _mm_shuffle_ps(tmp, tmp, 1));
#endif
return _mm_cvtss_f32(tmp);
}
};
template <>
EIGEN_STRONG_INLINE float predux(const Packet4f& a) {
return sse_predux_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) {
return sse_predux_mul_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) {
return sse_predux_min_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet4f& a) {
return sse_predux_min_prop_impl<PropagateNumbers, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet4f& a) {
return sse_predux_min_prop_impl<PropagateNaN, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) {
return sse_predux_max_impl<Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet4f& a) {
return sse_predux_max_prop_impl<PropagateNumbers, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet4f& a) {
return sse_predux_max_prop_impl<PropagateNaN, Packet4f>::run(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& a) {
return _mm_movemask_ps(a) != 0x0;
}
/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2d -- -- -- -- -- -- -- -- -- -- -- -- */
template <typename Op>
struct sse_predux_common<Packet2d, Op> {
static EIGEN_STRONG_INLINE double run(const Packet2d& a) {
Packet2d tmp;
tmp = Op::packetOp(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(tmp);
}
};
template <>
EIGEN_STRONG_INLINE double predux(const Packet2d& a) {
return sse_predux_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) {
return sse_predux_mul_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) {
return sse_predux_min_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet2d& a) {
return sse_predux_min_prop_impl<PropagateNumbers, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet2d& a) {
return sse_predux_min_prop_impl<PropagateNaN, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) {
return sse_predux_max_impl<Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet2d& a) {
return sse_predux_max_prop_impl<PropagateNumbers, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet2d& a) {
return sse_predux_max_prop_impl<PropagateNaN, Packet2d>::run(a);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2d& a) {
return _mm_movemask_pd(a) != 0x0;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REDUCTIONS_SSE_H
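A hedged sketch of the NaN-propagation variants defined above (needs <limits>; assumes SSE and the Eigen::internal namespace):
// PropagateNumbers reduces over the numeric lanes and skips NaNs;
// PropagateNaN returns NaN as soon as any lane is NaN.
EIGEN_ALIGN16 float d[4] = {2.f, std::numeric_limits<float>::quiet_NaN(), 1.f, 4.f};
Packet4f p = pload<Packet4f>(d);
float lo = predux_min<PropagateNumbers>(p);  // 1.f
float nn = predux_min<PropagateNaN>(p);      // quiet NaN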

View File

@ -55,7 +55,7 @@ namespace internal {
ConjugateRhs, ColMajor, 1> { \
typedef gebp_traits<EIGTYPE, EIGTYPE> Traits; \
\
static void run(Index rows, Index cols, Index depth, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs, \
static void run(Index rows, Index cols, Index depth, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_, \
Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha, \
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, GemmParallelInfo<Index>* /*info = 0*/) { \
using std::conj; \
@ -84,20 +84,20 @@ namespace internal {
\
/* Set a, b, c */ \
if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, m, k, OuterStride<>(lhsStride)); \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride)); \
a_tmp = lhs.conjugate(); \
a = a_tmp.data(); \
lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else \
a = _lhs; \
a = lhs_; \
\
if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, k, n, OuterStride<>(rhsStride)); \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride)); \
b_tmp = rhs.conjugate(); \
b = b_tmp.data(); \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else \
b = _rhs; \
b = rhs_; \
\
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, \
(const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
@ -116,6 +116,88 @@ GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif
// If OpenBLAS with BUILD_BFLOAT16=1 support is available,
// use sbgemm for bfloat16.
#if EIGEN_USE_OPENBLAS_BFLOAT16
extern "C" {
// OpenBLAS prototype.
void sbgemm_(const char* trans_a, const char* trans_b, const int* M, const int* N, const int* K, const float* alpha,
const Eigen::bfloat16* A, const int* lda, const Eigen::bfloat16* B, const int* ldb, const float* beta,
float* C, const int* ldc);
} // extern "C"
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index, Eigen::bfloat16, LhsStorageOrder, ConjugateLhs, Eigen::bfloat16,
RhsStorageOrder, ConjugateRhs, ColMajor, 1> {
typedef gebp_traits<Eigen::bfloat16, Eigen::bfloat16> Traits;
static void run(Index rows, Index cols, Index depth, const Eigen::bfloat16* lhs_, Index lhsStride,
const Eigen::bfloat16* rhs_, Index rhsStride, Eigen::bfloat16* res, Index resIncr, Index resStride,
Eigen::bfloat16 alpha, level3_blocking<Eigen::bfloat16, Eigen::bfloat16>& /*blocking*/,
GemmParallelInfo<Index>* /*info = 0*/) {
using std::conj;
if (rows == 0 || cols == 0 || depth == 0) return;
EIGEN_ONLY_USED_FOR_DEBUG(resIncr);
eigen_assert(resIncr == 1);
char transa, transb;
BlasIndex m, n, k, lda, ldb, ldc;
const Eigen::bfloat16 *a, *b;
float falpha = static_cast<float>(alpha);
float fbeta = float(1.0);
using MatrixXbf16 = Matrix<Eigen::bfloat16, Dynamic, Dynamic>;
MatrixXbf16 a_tmp, b_tmp;
MatrixXf r_tmp;
/* Set transpose options */
transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';
transb = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';
/* Set m, n, k */
m = convert_index<BlasIndex>(rows);
n = convert_index<BlasIndex>(cols);
k = convert_index<BlasIndex>(depth);
/* Set lda, ldb, ldc */
lda = convert_index<BlasIndex>(lhsStride);
ldb = convert_index<BlasIndex>(rhsStride);
ldc = convert_index<BlasIndex>(m);
/* Set a, b, c */
if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {
Map<const MatrixXbf16, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));
a_tmp = lhs.conjugate();
a = a_tmp.data();
lda = convert_index<BlasIndex>(a_tmp.outerStride());
} else {
a = lhs_;
}
if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {
Map<const MatrixXbf16, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));
b_tmp = rhs.conjugate();
b = b_tmp.data();
ldb = convert_index<BlasIndex>(b_tmp.outerStride());
} else {
b = rhs_;
}
// Evaluate into a temporary intermediate array, seeded with the current
// output values so that the fbeta == 1.0 accumulation matches the generic
// BLAS path above.
r_tmp = Map<const MatrixXbf16, 0, OuterStride<> >(res, m, n, OuterStride<>(resStride)).cast<float>();
sbgemm_(&transa, &transb, &m, &n, &k, (const float*)&numext::real_ref(falpha), a, &lda, b, &ldb,
(const float*)&numext::real_ref(fbeta), r_tmp.data(), &ldc);
// Cast to the output.
Map<MatrixXbf16, 0, OuterStride<> > result(res, m, n, OuterStride<>(resStride));
result = r_tmp.cast<Eigen::bfloat16>();
}
};
#endif // EIGEN_USE_OPENBLAS_BFLOAT16
} // namespace internal
} // end namespace Eigen
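A hedged usage sketch of this dispatch (assumes OpenBLAS was built with BUILD_BFLOAT16=1 and Eigen is compiled with EIGEN_USE_OPENBLAS_BFLOAT16=1; sizes are illustrative):
#include <Eigen/Dense>
using MatBF16 = Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic>;
MatBF16 A = MatBF16::Random(256, 128), B = MatBF16::Random(128, 64);
// Routed through sbgemm_: bfloat16 inputs, float accumulation, cast back.
MatBF16 C = A * B;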

View File

@ -164,6 +164,11 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
enum { LhsUpLo = LhsMode & (Upper | Lower) };
// Verify that the Rhs is a vector in the correct orientation.
// Otherwise, we break the assumption that we are multiplying
// MxN * Nx1.
static_assert(Rhs::ColsAtCompileTime == 1, "The RHS must be a column vector.");
template <typename Dest>
static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
typedef typename Dest::Scalar ResScalar;

View File

@ -8,7 +8,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#define EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
@ -98,4 +98,4 @@
#endif // gpu_assert
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H

View File

@ -8,7 +8,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#if defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
@ -40,6 +40,6 @@
#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#undef EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H

View File

@ -762,7 +762,7 @@ void swap(scoped_array<T>& a, scoped_array<T>& b) {
* This is accomplished through alloca if the latter is supported and if the required number of bytes
* is below EIGEN_STACK_ALLOCATION_LIMIT.
*/
#ifdef EIGEN_ALLOCA
#if defined(EIGEN_ALLOCA) && !defined(EIGEN_NO_ALLOCA)
#if EIGEN_DEFAULT_ALIGN_BYTES > 0
// We always manually re-align the result of EIGEN_ALLOCA.
@ -785,14 +785,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* pt
#define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
#endif
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = (BUFFER) != 0 ? (BUFFER) \
: reinterpret_cast<TYPE*>((sizeof(TYPE) * SIZE <= EIGEN_STACK_ALLOCATION_LIMIT) \
? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * SIZE) \
: Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
(BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * SIZE > EIGEN_STACK_ALLOCATION_LIMIT)
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = (BUFFER) != 0 ? (BUFFER) \
: reinterpret_cast<TYPE*>((sizeof(TYPE) * (SIZE) <= EIGEN_STACK_ALLOCATION_LIMIT) \
? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * (SIZE)) \
: Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
(BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * (SIZE) > EIGEN_STACK_ALLOCATION_LIMIT)
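A hedged sketch of how the (internal) macro above is used; scratch_fill and its body are illustrative, not from the patch:
// Requests n floats: on the stack via EIGEN_ALIGNED_ALLOCA when
// sizeof(float) * n fits under EIGEN_STACK_ALLOCATION_LIMIT, on the heap
// otherwise; the handler object frees the heap case at scope exit.
// Defining EIGEN_NO_ALLOCA now forces the heap path unconditionally.
void scratch_fill(Eigen::Index n) {
  ei_declare_aligned_stack_constructed_variable(float, buf, n, 0);
  for (Eigen::Index i = 0; i < n; ++i) buf[i] = 0.f;
}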
#define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \
Eigen::internal::local_nested_eval_wrapper<XPR_T, N> EIGEN_CAT(NAME, _wrapper)( \
@ -805,10 +805,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* pt
#else
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER) \
Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
TYPE* NAME = \
(BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)( \
(BUFFER) == 0 ? NAME : 0, SIZE, true)
#define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \

View File

@ -345,7 +345,7 @@ EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorT
// Apply similarity transformation to remaining columns,
// i.e., A = H A H' where H = I - h v v' and v = matA.col(i).tail(n-i-1)
matA.col(i).coeffRef(i + 1) = (RealScalar)1;
matA.col(i).coeffRef(i + 1) = Scalar(1);
hCoeffs.tail(n - i - 1).noalias() =
(matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() *

View File

@ -85,6 +85,29 @@ class QuaternionBase : public RotationBase<Derived, 3> {
return derived().coeffs();
}
/** \returns a vector containing the coefficients, rearranged into the order [\c w, \c x, \c y, \c z].
*
* This is the order expected by the \code Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar&
* z) \endcode constructor, but not the order of the internal vector representation. Therefore, it returns a newly
* constructed vector.
*
* \sa QuaternionBase::coeffsScalarLast()
* */
EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarFirst() const {
return derived().coeffsScalarFirst();
}
/** \returns a vector containing the coefficients in their original order [\c x, \c y, \c z, \c w].
*
* This is equivalent to \code coeffs() \endcode, but returns a newly constructed vector for uniformity with \code
* coeffsScalarFirst() \endcode.
*
* \sa QuaternionBase::coeffsScalarFirst()
* */
EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarLast() const {
return derived().coeffsScalarLast();
}
/** \returns a vector expression of the coefficients (x,y,z,w) */
EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
@ -357,12 +380,23 @@ class Quaternion : public QuaternionBase<Quaternion<Scalar_, Options_> > {
EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarLast(const Scalar& x, const Scalar& y, const Scalar& z,
const Scalar& w);
EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarFirst(const Scalar& w, const Scalar& x, const Scalar& y,
const Scalar& z);
template <typename Derived1, typename Derived2>
EIGEN_DEVICE_FUNC static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
}
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment))
#ifdef EIGEN_QUATERNION_PLUGIN
@ -437,6 +471,12 @@ class Map<const Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<const
EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
}
protected:
const Coefficients m_coeffs;
};
@ -473,6 +513,12 @@ class Map<Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<Quaternion<
EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
}
protected:
Coefficients m_coeffs;
};
@ -694,6 +740,35 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::UnitR
return Quaternion(a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
}
/** Constructs a quaternion from its coefficients in the order [\c x, \c y, \c z, \c w], i.e. vector part [\c x, \c y,
* \c z] first, scalar part \a w LAST.
*
* This factory accepts the parameters in the same order as the underlying coefficient vector. Consider using this
* factory function to make the parameter ordering explicit.
*/
template <typename Scalar, int Options>
EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarLast(const Scalar& x,
const Scalar& y,
const Scalar& z,
const Scalar& w) {
return Quaternion(w, x, y, z);
}
/** Constructs a quaternion from its coefficients in the order [\c w, \c x, \c y, \c z], i.e. scalar part \a w FIRST,
* vector part [\c x, \c y, \c z] last.
*
* This factory accepts the parameters in the same order as the constructor \code Quaternion(const Scalar& w, const
* Scalar& x, const Scalar& y, const Scalar& z) \endcode. Consider using this factory function to make the parameter
* ordering explicit.
*/
template <typename Scalar, int Options>
EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarFirst(const Scalar& w,
const Scalar& x,
const Scalar& y,
const Scalar& z) {
return Quaternion(w, x, y, z);
}
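A short usage sketch contrasting the two factories (identity rotation, illustrative values):
// Same quaternion, two argument orders; storage stays [x, y, z, w] either way.
Eigen::Quaternionf qa = Eigen::Quaternionf::FromCoeffsScalarFirst(1.f, 0.f, 0.f, 0.f);  // w first
Eigen::Quaternionf qb = Eigen::Quaternionf::FromCoeffsScalarLast(0.f, 0.f, 0.f, 1.f);   // w last
// qa.coeffsScalarFirst() is [1, 0, 0, 0], while qa.coeffs() is still [0, 0, 0, 1].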
/** Returns a quaternion representing a rotation between
* the two arbitrary vectors \a a and \a b. In other words, the built
* rotation represent a rotation sending the line of direction \a a

View File

@ -78,6 +78,17 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> PermutationPType;
typedef typename MatrixType::PlainObject PlainObject;
/** \brief Reports whether the LU factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
return Success;
}
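A hedged sketch of why the trivial info() is still useful: it lets FullPivLU slot into generic code written against factorizations that can fail (A and b are assumed inputs):
Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
if (lu.info() == Eigen::Success) {  // always true here, but generic callers check
  Eigen::VectorXd x = lu.solve(b);
}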
/**
* \brief Default Constructor.
*

View File

@ -268,7 +268,7 @@ struct Assignment<DstXprType, Inverse<XprType>,
* \note This matrix must be invertible, otherwise the result is undefined. If you need an
* invertibility check, do the following:
* \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
* \li for the general case, use class FullPivLU.
* \li for the general case, use class PartialPivLU.
*
* Example: \include MatrixBase_inverse.cpp
* Output: \verbinclude MatrixBase_inverse.out
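To make the note concrete, a hedged sketch of the fixed-size checked path (m is an assumed Matrix3d):
Eigen::Matrix3d inv;
double det;
bool invertible;
m.computeInverseAndDetWithCheck(inv, det, invertible);  // fixed sizes up to 4x4
// For dynamic sizes, factor with PartialPivLU and call inverse() on the factorization.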

View File

@ -90,6 +90,17 @@ class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationInde
typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> TranspositionType;
typedef typename MatrixType::PlainObject PlainObject;
/** \brief Reports whether the LU factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
return Success;
}
/**
* \brief Default Constructor.
*

View File

@ -82,6 +82,17 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
typedef typename MatrixType::PlainObject PlainObject;
/** \brief Reports whether the QR factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
return Success;
}
/** \brief Default Constructor.
*
* The default constructor is useful in cases in which the user intends to

View File

@ -75,6 +75,17 @@ class HouseholderQR : public SolverBase<HouseholderQR<MatrixType_>> {
typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>>
HouseholderSequenceType;
/** \brief Reports whether the QR factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
* \returns \c Success
*/
ComputationInfo info() const {
eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
return Success;
}
/**
* \brief Default Constructor.
*

View File

@ -165,7 +165,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
BDCSVD(const MatrixType& matrix) : m_algoswap(16), m_numIters(0) {
template <typename Derived>
BDCSVD(const MatrixBase<Derived>& matrix) : m_algoswap(16), m_numIters(0) {
compute_impl(matrix, internal::get_computation_options(Options));
}
@ -181,7 +182,9 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
* \deprecated Will be removed in the next major Eigen version. Options should
* be specified in the \a Options template parameter.
*/
EIGEN_DEPRECATED BDCSVD(const MatrixType& matrix, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
template <typename Derived>
EIGEN_DEPRECATED BDCSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions)
: m_algoswap(16), m_numIters(0) {
internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
compute_impl(matrix, computationOptions);
}
@ -193,7 +196,10 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
BDCSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); }
template <typename Derived>
BDCSVD& compute(const MatrixBase<Derived>& matrix) {
return compute_impl(matrix, m_computationOptions);
}
/** \brief Method performing the decomposition of given matrix, as specified by
* the `computationOptions` parameter.
@ -204,7 +210,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
* \deprecated Will be removed in the next major Eigen version. Options should
* be specified in the \a Options template parameter.
*/
EIGEN_DEPRECATED BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions) {
template <typename Derived>
EIGEN_DEPRECATED BDCSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
return compute_impl(matrix, computationOptions);
}
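The practical effect of widening these signatures to MatrixBase<Derived> is that expressions can feed the decomposition directly; a hedged sketch:
Eigen::MatrixXf A = Eigen::MatrixXf::Random(100, 40);
// An expression argument now compiles; scalar type and compile-time sizes
// are still enforced by the static asserts in compute_impl.
Eigen::BDCSVD<Eigen::MatrixXf> svd(A.topRows(60) * 2.0f);
Eigen::VectorXf sv = svd.singularValues();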
@ -215,7 +222,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
}
private:
BDCSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
template <typename Derived>
BDCSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals,
@ -307,8 +315,13 @@ void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int
} // end allocate
template <typename MatrixType, int Options>
BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixType& matrix,
template <typename Derived>
BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
unsigned int computationOptions) {
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
Input matrix must have the same Scalar type as the BDCSVD object.);
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "\n\n\n================================================================================================="
"=====================\n\n\n";

View File

@ -58,7 +58,8 @@ class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
// construct this by moving from a parent object
BDCSVD_LAPACKE(SVD&& svd) : SVD(std::move(svd)) {}
void compute_impl_lapacke(const MatrixType& matrix, unsigned int computationOptions) {
template <typename Derived>
void compute_impl_lapacke(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
SVD::allocate(matrix.rows(), matrix.cols(), computationOptions);
SVD::m_nonzeroSingularValues = SVD::m_diagSize;
@ -120,8 +121,8 @@ class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
}
};
template <typename MatrixType_, int Options>
BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixType_& matrix,
template <typename MatrixType_, int Options, typename Derived>
BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixBase<Derived>& matrix,
int computationOptions) {
// we need to move to the wrapper type and back
BDCSVD_LAPACKE<MatrixType_, Options> tmpSvd(std::move(svd));
@ -134,12 +135,13 @@ BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd,
} // end namespace internal
#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS) \
template <> \
inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) { \
return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions); \
#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS) \
template <> \
template <typename Derived> \
inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const MatrixBase<Derived>& matrix, unsigned int computationOptions) { \
return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions); \
}
#define EIGEN_LAPACK_SDD_OPTIONS(OPTIONS) \

View File

@ -565,7 +565,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
explicit JacobiSVD(const MatrixType& matrix) { compute_impl(matrix, internal::get_computation_options(Options)); }
template <typename Derived>
explicit JacobiSVD(const MatrixBase<Derived>& matrix) {
compute_impl(matrix, internal::get_computation_options(Options));
}
/** \brief Constructor performing the decomposition of given matrix using specified options
* for computing unitaries.
@ -580,8 +583,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
* be specified in the \a Options template parameter.
*/
// EIGEN_DEPRECATED // TODO(cantonios): re-enable after fixing a few 3p libraries that error on deprecation warnings.
JacobiSVD(const MatrixType& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
template <typename Derived>
JacobiSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(computationOptions, matrix.rows(),
matrix.cols());
compute_impl(matrix, computationOptions);
}
@ -590,7 +595,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
*
* \param matrix the matrix to decompose
*/
JacobiSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); }
template <typename Derived>
JacobiSVD& compute(const MatrixBase<Derived>& matrix) {
return compute_impl(matrix, m_computationOptions);
}
/** \brief Method performing the decomposition of given matrix, as specified by
* the `computationOptions` parameter.
@ -601,8 +609,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
* \deprecated Will be removed in the next major Eigen version. Options should
* be specified in the \a Options template parameter.
*/
EIGEN_DEPRECATED JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixType, Options>(m_computationOptions, matrix.rows(), matrix.cols());
template <typename Derived>
EIGEN_DEPRECATED JacobiSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(m_computationOptions, matrix.rows(),
matrix.cols());
return compute_impl(matrix, computationOptions);
}
@ -626,7 +636,8 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
}
private:
JacobiSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
template <typename Derived>
JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
protected:
using Base::m_computationOptions;
@ -664,8 +675,13 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
};
template <typename MatrixType, int Options>
JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixType& matrix,
template <typename Derived>
JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
unsigned int computationOptions) {
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
Input matrix must have the same Scalar type as the JacobiSVD object.);
using std::abs;
allocate(matrix.rows(), matrix.cols(), computationOptions);

View File

@ -40,65 +40,65 @@ namespace Eigen {
/** \internal Specialization for the data types supported by LAPACKe */
#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \
template <> \
inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) { \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
/*typedef MatrixType::Scalar Scalar;*/ \
/*typedef MatrixType::RealScalar RealScalar;*/ \
allocate(matrix.rows(), matrix.cols(), computationOptions); \
\
/*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
m_nonzeroSingularValues = diagSize(); \
\
lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
lapack_int matrix_order = LAPACKE_COLROW; \
char jobu, jobvt; \
LAPACKE_TYPE *u, *vt, dummy; \
jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
if (computeU()) { \
ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
u = (LAPACKE_TYPE*)m_matrixU.data(); \
} else { \
ldu = 1; \
u = &dummy; \
} \
MatrixType localV; \
lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(cols()) \
: (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize()) \
: 1; \
if (computeV()) { \
localV.resize(vt_rows, cols()); \
ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \
vt = (LAPACKE_TYPE*)localV.data(); \
} else { \
ldvt = 1; \
vt = &dummy; \
} \
Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; \
superb.resize(diagSize(), 1); \
MatrixType m_temp; \
m_temp = matrix; \
lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd( \
matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()), \
internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda, \
(LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
/* Check the result of the LAPACK call */ \
if (info < 0 || !m_singularValues.allFinite()) { \
m_info = InvalidInput; \
} else if (info > 0) { \
m_info = NoConvergence; \
} else { \
m_info = Success; \
if (computeV()) m_matrixV = localV.adjoint(); \
} \
/* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; \
* m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
m_isInitialized = true; \
return *this; \
#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \
template <> \
template <typename Derived> \
inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>& \
JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
const MatrixBase<Derived>& matrix, unsigned int computationOptions) { \
/*typedef MatrixType::Scalar Scalar;*/ \
/*typedef MatrixType::RealScalar RealScalar;*/ \
allocate(matrix.rows(), matrix.cols(), computationOptions); \
\
/*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
m_nonzeroSingularValues = diagSize(); \
\
lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
lapack_int matrix_order = LAPACKE_COLROW; \
char jobu, jobvt; \
LAPACKE_TYPE *u, *vt, dummy; \
jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
if (computeU()) { \
ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
u = (LAPACKE_TYPE*)m_matrixU.data(); \
} else { \
ldu = 1; \
u = &dummy; \
} \
MatrixType localV; \
lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(cols()) \
: (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize()) \
: 1; \
if (computeV()) { \
localV.resize(vt_rows, cols()); \
ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \
vt = (LAPACKE_TYPE*)localV.data(); \
} else { \
ldvt = 1; \
vt = &dummy; \
} \
Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; \
superb.resize(diagSize(), 1); \
MatrixType m_temp; \
m_temp = matrix; \
lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd( \
matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()), \
internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda, \
(LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
/* Check the result of the LAPACK call */ \
if (info < 0 || !m_singularValues.allFinite()) { \
m_info = InvalidInput; \
} else if (info > 0) { \
m_info = NoConvergence; \
} else { \
m_info = Success; \
if (computeV()) m_matrixV = localV.adjoint(); \
} \
/* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; \
* m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
m_isInitialized = true; \
return *this; \
}
#define EIGEN_LAPACK_SVD_OPTIONS(OPTIONS) \

View File

@ -274,6 +274,10 @@ struct simpl_chol_helper {
}
};
// Symbol is ODR-used, so we need a definition.
template <typename Scalar, typename StorageIndex>
constexpr StorageIndex simpl_chol_helper<Scalar, StorageIndex>::kEmpty;
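For readers on pre-C++17 toolchains, a minimal illustration of the rule this satisfies (hypothetical struct, not from the patch):
struct S {
  static constexpr int kEmpty = -1;  // in-class declaration with initializer
};
// Pre-C++17, ODR-using the member (e.g. binding `const int& r = S::kEmpty;`)
// also requires this out-of-line definition; since C++17 it is implicitly inline.
constexpr int S::kEmpty;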
} // namespace internal
template <typename Derived>

View File

@ -36,10 +36,10 @@ inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot
Scalar res1(0);
Scalar res2(0);
for (; i; ++i) {
res1 += numext::conj(i.value()) * other.coeff(i.index());
res1 = numext::fma(numext::conj(i.value()), other.coeff(i.index()), res1);
++i;
if (i) {
res2 += numext::conj(i.value()) * other.coeff(i.index());
res2 = numext::fma(numext::conj(i.value()), other.coeff(i.index()), res2);
}
}
return res1 + res2;
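The switch to numext::fma keeps each product-add as one contracted operation; a hedged scalar sketch of the same two-accumulator pattern:
float dot2(const float* x, const float* y, int n) {
  float r1 = 0.f, r2 = 0.f;  // two independent chains hide the fma latency
  int i = 0;
  for (; i + 1 < n; i += 2) {
    r1 = Eigen::numext::fma(x[i], y[i], r1);
    r2 = Eigen::numext::fma(x[i + 1], y[i + 1], r2);
  }
  if (i < n) r1 = Eigen::numext::fma(x[i], y[i], r1);
  return r1 + r2;
}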

View File

@ -7,9 +7,7 @@
script:
- . ci/scripts/build.linux.script.sh
tags:
- linux
- eigen-runner
- cross-compiler
- saas-linux-2xlarge-amd64
rules:
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
@ -244,11 +242,13 @@ build:linux:rocm-latest:gcc-10:
EIGEN_CI_CROSS_TARGET_TRIPLE: arm-linux-gnueabihf
EIGEN_CI_ADDITIONAL_ARGS: >
-DEIGEN_TEST_CUSTOM_CXX_FLAGS=-march=armv7-a;-mfpu=neon-vfpv4
-DCMAKE_SYSTEM_NAME=Linux
-DCMAKE_CROSSCOMPILING_EMULATOR=qemu-arm-static;-L;/usr/arm-linux-gnueabihf
build:linux:cross:arm:gcc-10:default:
extends: .build:linux:cross:arm
variables:
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf qemu-user-static
EIGEN_CI_CROSS_C_COMPILER: arm-linux-gnueabihf-gcc-10
EIGEN_CI_CROSS_CXX_COMPILER: arm-linux-gnueabihf-g++-10
@ -258,7 +258,7 @@ build:linux:cross:arm:clang-12:default:
EIGEN_CI_INSTALL: clang-12
EIGEN_CI_C_COMPILER: clang-12
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12 qemu-user-static
######## aarch64 ###############################################################
@ -268,6 +268,8 @@ build:linux:cross:arm:clang-12:default:
EIGEN_CI_TARGET_ARCH: aarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: aarch64-linux-gnu
EIGEN_CI_ADDITIONAL_ARGS: -DEIGEN_TEST_CUSTOM_CXX_FLAGS=-march=armv8.2-a+fp16
tags:
- saas-linux-large-arm64
build:linux:cross:aarch64:gcc-10:default:
extends: .build:linux:cross:aarch64
@ -290,28 +292,27 @@ build:linux:cross:aarch64:clang-12:default:
.build:linux:cross:ppc64le:
extends: .build:linux:cross
image: ubuntu:24.04
variables:
EIGEN_CI_TARGET_ARCH: ppc64le
EIGEN_CI_CROSS_TARGET_TRIPLE: powerpc64le-linux-gnu
EIGEN_CI_ADDITIONAL_ARGS: >-
-DCMAKE_SYSTEM_NAME=Linux
-DCMAKE_CROSSCOMPILING_EMULATOR=qemu-ppc64le-static;-L;/usr/powerpc64le-linux-gnu
build:linux:cross:ppc64le:gcc-10:default:
build:linux:cross:ppc64le:gcc-14:default:
extends: .build:linux:cross:ppc64le
variables:
EIGEN_CI_C_COMPILER: gcc-10
EIGEN_CI_CXX_COMPILER: g++-10
EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu
EIGEN_CI_CROSS_C_COMPILER: powerpc64le-linux-gnu-gcc-10
EIGEN_CI_CROSS_CXX_COMPILER: powerpc64le-linux-gnu-g++-10
# Temporarily disable MMA until #2457 is resolved.
EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_ALTIVEC_DISABLE_MMA=1"
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu qemu-user-static
EIGEN_CI_CROSS_C_COMPILER: powerpc64le-linux-gnu-gcc-14
EIGEN_CI_CROSS_CXX_COMPILER: powerpc64le-linux-gnu-g++-14
build:linux:cross:ppc64le:clang-12:default:
build:linux:cross:ppc64le:clang-16:default:
extends: .build:linux:cross:ppc64le
variables:
EIGEN_CI_INSTALL: clang-12
EIGEN_CI_C_COMPILER: clang-12
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu clang-12
EIGEN_CI_C_COMPILER: clang-16
EIGEN_CI_CXX_COMPILER: clang++-16
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu clang-16 qemu-user-static
######## loongarch64 #################################################
@ -320,17 +321,13 @@ build:linux:cross:ppc64le:clang-12:default:
variables:
EIGEN_CI_TARGET_ARCH: loongarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu
tags:
- eigen-runner
- linux
- cross-compiler
# GCC-14 (minimum on Ubuntu 24)
build:linux:cross:loongarch64:gcc-14:default:
extends: .build:linux:cross:loongarch64
image: ubuntu:24.04
variables:
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu qemu-user-static
EIGEN_CI_CROSS_C_COMPILER: loongarch64-linux-gnu-gcc-14
EIGEN_CI_CROSS_CXX_COMPILER: loongarch64-linux-gnu-g++-14
EIGEN_CI_ADDITIONAL_ARGS: >-

View File

@ -9,6 +9,8 @@
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_MERGE_REQUEST_LABELS =~ "/all-tests/"
tags:
- saas-linux-2xlarge-amd64
##### x86-64 ###################################################################
.test:linux:x86-64:
@ -16,10 +18,6 @@
variables:
EIGEN_CI_TARGET_ARCH: x86_64
EIGEN_CI_CROSS_TARGET_TRIPLE: x86_64-linux-gnu
tags:
- eigen-runner
- linux
- x86-64
# GCC-6 (minimum on Ubuntu 18.04)
.test:linux:x86-64:gcc-6:default:
@ -289,18 +287,13 @@ test:linux:cuda-12.2:clang-12:
variables:
EIGEN_CI_TARGET_ARCH: arm
EIGEN_CI_CROSS_TARGET_TRIPLE: arm-linux-gnueabihf
# Enable cross-compiled arm binary to run on aarch64.
EIGEN_CI_BEFORE_SCRIPT: "ln -s /usr/arm-linux-gnueabihf/lib/ld-linux-armhf.so.3 /lib/ && export LD_LIBRARY_PATH=/usr/arm-linux-gnueabihf/lib/"
tags:
- eigen-runner
- linux
- aarch64
EIGEN_CI_CTEST_ARGS: --timeout 2000
.test:linux:arm:gcc-10:default:
extends: .test:linux:arm
needs: [ build:linux:cross:arm:gcc-10:default ]
variables:
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf qemu-user-static
test:linux:arm:gcc-10:default:official:
extends: .test:linux:arm:gcc-10:default
@ -316,7 +309,7 @@ test:linux:arm:gcc-10:default:unsupported:
extends: .test:linux:arm
needs: [ build:linux:cross:arm:clang-12:default ]
variables:
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12
EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12 qemu-user-static
test:linux:arm:clang-12:default:official:
extends: .test:linux:arm:clang-12:default
@ -336,9 +329,7 @@ test:linux:arm:clang-12:default:unsupported:
EIGEN_CI_TARGET_ARCH: aarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: aarch64-linux-gnu
tags:
- eigen-runner
- linux
- aarch64
- saas-linux-large-arm64
.test:linux:aarch64:gcc-10:default:
extends: .test:linux:aarch64
@ -376,60 +367,54 @@ test:linux:aarch64:clang-12:default:unsupported:
.test:linux:ppc64le:
extends: .test:linux
image: ubuntu:24.04
variables:
EIGEN_CI_TARGET_ARCH: ppc64le
EIGEN_CI_CROSS_TARGET_TRIPLE: powerpc64le-linux-gnu
tags:
- eigen-runner
- linux
- ppc64le
EIGEN_CI_CTEST_ARGS: --timeout 2000
.test:linux:ppc64le:gcc-10:default:
.test:linux:ppc64le:gcc-14:default:
extends: .test:linux:ppc64le
needs: [ build:linux:cross:ppc64le:gcc-10:default ]
needs: [ build:linux:cross:ppc64le:gcc-14:default ]
variables:
EIGEN_CI_INSTALL: g++-10
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu qemu-user-static
test:linux:ppc64le:gcc-10:default:official:
extends: .test:linux:ppc64le:gcc-10:default
test:linux:ppc64le:gcc-14:default:official:
extends: .test:linux:ppc64le:gcc-14:default
variables:
EIGEN_CI_CTEST_LABEL: Official
test:linux:ppc64le:gcc-10:default:unsupported:
extends: .test:linux:ppc64le:gcc-10:default
test:linux:ppc64le:gcc-14:default:unsupported:
extends: .test:linux:ppc64le:gcc-14:default
variables:
EIGEN_CI_CTEST_LABEL: Unsupported
.test:linux:ppc64le:clang-12:default:
.test:linux:ppc64le:clang-16:default:
extends: .test:linux:ppc64le
needs: [ build:linux:cross:ppc64le:clang-12:default ]
needs: [ build:linux:cross:ppc64le:clang-16:default ]
variables:
EIGEN_CI_INSTALL: clang-12
EIGEN_CI_CROSS_INSTALL: g++-14-powerpc64le-linux-gnu clang-16 qemu-user-static
test:linux:ppc64le:clang-12:default:official:
extends: .test:linux:ppc64le:clang-12:default
test:linux:ppc64le:clang-16:default:official:
extends: .test:linux:ppc64le:clang-16:default
variables:
EIGEN_CI_CTEST_LABEL: Official
test:linux:ppc64le:clang-12:default:unsupported:
extends: .test:linux:ppc64le:clang-12:default
test:linux:ppc64le:clang-16:default:unsupported:
extends: .test:linux:ppc64le:clang-16:default
variables:
EIGEN_CI_CTEST_LABEL: Unsupported
##### loongarch64 ###################################################################
##### loongarch64 ##############################################################
.test:linux:loongarch64:
extends: .test:linux
image: ubuntu:24.04
variables:
EIGEN_CI_TARGET_ARCH: loongarch64
EIGEN_CI_CROSS_TARGET_TRIPLE: loongarch64-linux-gnu
# Install QEMU and set up the execution environment in the image
EIGEN_CI_CROSS_INSTALL: g++-14-loongarch64-linux-gnu qemu-user-static
EIGEN_CI_CTEST_ARGS: --timeout 2000
tags:
- eigen-runner
- linux
- cross-compiler
# GCC-14 (Ubuntu 24)
.test:linux:loongarch64:gcc-14:default:

View File

@ -16,7 +16,7 @@
#pragma GCC diagnostic ignored "-Wshadow"
#endif
#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
#if defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW)
struct my_exception {
my_exception() {}
~my_exception() {}
@ -76,7 +76,7 @@ class AnnoyingScalar {
}
AnnoyingScalar operator+(const AnnoyingScalar& other) const {
#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
#if defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW)
countdown--;
if (countdown <= 0 && !dont_throw) throw my_exception();
#endif

View File

@ -1340,7 +1340,7 @@ EIGEN_DECLARE_TEST(array_cwise) {
CALL_SUBTEST_3(array_generic(Array44d()));
CALL_SUBTEST_4(array_generic(
ArrayXXcf(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_7(array_generic(
CALL_SUBTEST_5(array_generic(
ArrayXXf(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_8(array_generic(
ArrayXXi(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));

View File

@ -8,7 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Various sanity tests with exceptions and non trivially copyable scalar type.
// - no memory leak when a custom scalar type trow an exceptions
// - no memory leak when a custom scalar type throws an exception
// - todo: complete the list of tests!
#define EIGEN_STACK_ALLOCATION_LIMIT 100000000
@ -21,9 +21,8 @@
AnnoyingScalar::countdown = 100; \
int before = AnnoyingScalar::instances; \
bool exception_thrown = false; \
try { \
OP; \
} catch (my_exception) { \
EIGEN_TRY { OP; } \
EIGEN_CATCH(my_exception) { \
exception_thrown = true; \
VERIFY(AnnoyingScalar::instances == before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
} \
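EIGEN_TRY/EIGEN_CATCH expand to plain try/catch when exceptions are enabled and to an if(true)/else pair when they are not, which is why the macro body no longer needs its own guard; schematically (risky_op and on_fail are placeholders):
EIGEN_TRY { risky_op(); }    // `try` under EIGEN_EXCEPTIONS, `if (true)` otherwise
EIGEN_CATCH(my_exception) {  // `catch (my_exception)` or a dead `else` branch
  on_fail();
}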
@ -35,7 +34,11 @@ EIGEN_DECLARE_TEST(exceptions) {
typedef Eigen::Matrix<AnnoyingScalar, Dynamic, Dynamic> MatrixType;
{
#if defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW)
AnnoyingScalar::dont_throw = false;
#else
AnnoyingScalar::dont_throw = true;
#endif
int n = 50;
VectorType v0(n), v1(n);
MatrixType m0(n, n), m1(n, n), m2(n, n);

View File

@ -78,6 +78,19 @@ void quaternion(void) {
VERIFY(ss.str() == "0i + 0j + 0k + 1");
#endif
// Consistent handling of scalar first/last conventions regardless of Eigen's own coefficient layout
const Scalar w(a);
const Vector3 xyz(v0);
q1 = Quaternionx::FromCoeffsScalarFirst(w, xyz.x(), xyz.y(), xyz.z());
q2 = Quaternionx::FromCoeffsScalarLast(xyz.x(), xyz.y(), xyz.z(), w);
VERIFY_IS_EQUAL(q1, q2);
VERIFY_IS_EQUAL(q1.coeffsScalarFirst()[0], w);
VERIFY_IS_EQUAL(q1.coeffsScalarFirst()(seqN(1, 3)), xyz);
VERIFY_IS_EQUAL(q1.coeffsScalarLast()[3], w);
VERIFY_IS_EQUAL(q1.coeffsScalarLast()(seqN(0, 3)), xyz);
// concatenation
q1 *= q2;

View File

@ -4,7 +4,7 @@
#include <Eigen/Core>
// Allow gpu** macros for generic tests.
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
#include <Eigen/src/Core/util/GpuHipCudaDefines.inc>
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
// doesn't allow std::tuple to compile for host code either. In these cases,

View File

@ -72,17 +72,16 @@ void test_conversion() {
// NaNs and infinities.
VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
VERIFY(!(numext::isnan)(float(half(0.0f))));
VERIFY((numext::isfinite)(float(half(65504.0f))));
VERIFY((numext::isfinite)(float(half(0.0f))));
VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
#if !EIGEN_COMP_MSVC
// Visual Studio errors out on divisions by 0
VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
#endif
VERIFY((numext::isnan)(float(NumTraits<half>::quiet_NaN())));
VERIFY((numext::isinf)(float(NumTraits<half>::infinity())));
VERIFY((numext::isinf)(float(-NumTraits<half>::infinity())));
// Exactly same checks as above, just directly on the half representation.
VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
@ -92,12 +91,9 @@ void test_conversion() {
VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
#if !EIGEN_COMP_MSVC
// Visual Studio errors out on divisions by 0
VERIFY((numext::isnan)(half(0.0 / 0.0)));
VERIFY((numext::isinf)(half(1.0 / 0.0)));
VERIFY((numext::isinf)(half(-1.0 / 0.0)));
#endif
VERIFY((numext::isnan)(NumTraits<half>::quiet_NaN()));
VERIFY((numext::isinf)(NumTraits<half>::infinity()));
VERIFY((numext::isinf)(-NumTraits<half>::infinity()));
// Conversion to bool
VERIFY(!static_cast<bool>(half(0.0)));
@ -204,19 +200,25 @@ void test_comparison() {
VERIFY(half(1.0f) != half(2.0f));
// Comparisons with NaNs and infinities.
#if !EIGEN_COMP_MSVC
// Visual Studio errors out on divisions by 0
VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
VERIFY(!(NumTraits<half>::quiet_NaN() == NumTraits<half>::quiet_NaN()));
VERIFY(NumTraits<half>::quiet_NaN() != NumTraits<half>::quiet_NaN());
VERIFY(!(half(1.0) == half(0.0 / 0.0)));
VERIFY(!(half(1.0) < half(0.0 / 0.0)));
VERIFY(!(half(1.0) > half(0.0 / 0.0)));
VERIFY(half(1.0) != half(0.0 / 0.0));
VERIFY(!(internal::random<half>() == NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() < NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() > NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() <= NumTraits<half>::quiet_NaN()));
VERIFY(!(internal::random<half>() >= NumTraits<half>::quiet_NaN()));
VERIFY(internal::random<half>() != NumTraits<half>::quiet_NaN());
VERIFY(half(1.0) < half(1.0 / 0.0));
VERIFY(half(1.0) > half(-1.0 / 0.0));
#endif
VERIFY(!(NumTraits<half>::quiet_NaN() == internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() < internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() > internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() <= internal::random<half>()));
VERIFY(!(NumTraits<half>::quiet_NaN() >= internal::random<half>()));
VERIFY(NumTraits<half>::quiet_NaN() != internal::random<half>());
VERIFY(internal::random<half>() < NumTraits<half>::infinity());
VERIFY(internal::random<half>() > -NumTraits<half>::infinity());
}
void test_basic_functions() {

View File

@ -343,7 +343,7 @@ static std::vector<std::string> eigen_assert_list;
#if !defined(EIGEN_TESTING_CONSTEXPR) && !defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
#define EIGEN_INTERNAL_DEBUGGING
#endif
#include <Eigen/QR> // required for createRandomPIMatrixOfRank and generateRandomMatrixSvs
#include <Eigen/Core>
inline void verify_impl(bool condition, const char* testname, const char* file, int line,
const char* condition_as_string) {
@ -935,3 +935,7 @@ int main(int argc, char* argv[]) {
#endif
#include "gpu_test_helper.h"
#ifndef EIGEN_TEST_MAX_SIZE
#define EIGEN_TEST_MAX_SIZE 320
#endif

View File

@ -1,6 +1,8 @@
#include "main.h"
#ifdef EIGEN_EXCEPTIONS
#include <exception> // std::exception
#endif
#include <Eigen/src/Core/util/MaxSizeVector.h>
@ -31,28 +33,27 @@ struct Foo {
std::cout << '~';
--Foo::object_count;
}
#ifdef EIGEN_EXCEPTIONS
class Fail : public std::exception {};
#endif
};
Index Foo::object_count = 0;
Index Foo::object_limit = 0;
EIGEN_DECLARE_TEST(cxx11_maxsizevector) {
EIGEN_DECLARE_TEST(maxsizevector) {
typedef MaxSizeVector<Foo> VectorX;
Foo::object_count = 0;
for (int r = 0; r < g_repeat; r++) {
Index rows = internal::random<Index>(3, 30);
Foo::object_limit = internal::random<Index>(0, rows - 2);
std::cout << "object_limit = " << Foo::object_limit << std::endl;
bool exception_raised = false;
#ifdef EIGEN_EXCEPTIONS
bool exception_raised = false;
try {
#endif
std::cout << "\nVectorX m(" << rows << ");\n";
VectorX vect(rows);
for (int i = 0; i < rows; ++i) vect.push_back(Foo());
#ifdef EIGEN_EXCEPTIONS
VERIFY(false); // not reached if exceptions are enabled
} catch (const Foo::Fail&) {
exception_raised = true;

View File

@ -354,28 +354,28 @@ void packetmath_boolean_mask_ops() {
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE1(internal::ptrue, internal::ptrue);
CHECK_CWISE1_MASK(internal::ptrue, internal::ptrue);
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(RealScalar(i));
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
// Test (-0) == (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
}
template <typename Scalar, typename Packet>
@ -384,28 +384,27 @@ void packetmath_boolean_mask_ops_real() {
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
// Test (-0) <=/< (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
}
template <typename Scalar, typename Packet, typename EnableIf = void>
@ -422,31 +421,30 @@ struct packetmath_boolean_mask_ops_notcomplex_test<
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
// Test (-0) <=/< (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
}
};
@ -700,11 +698,12 @@ void packetmath() {
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>(Scalar(0) - limit, limit);
}
} else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex) {
} else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex && !std::is_same<Scalar, bool>::value) {
// Prevent very small product results by adjusting range. Otherwise,
// we may end up with multiplying e.g. 32 Eigen::halfs with values < 1.
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>(Scalar(0.5), Scalar(1)) * (internal::random<bool>() ? Scalar(-1) : Scalar(1));
data1[i] = REF_MUL(internal::random<Scalar>(Scalar(0.5), Scalar(1)),
(internal::random<bool>() ? Scalar(-1) : Scalar(1)));
}
}
ref[0] = Scalar(1);

View File

@ -115,6 +115,30 @@ bool areApprox(const Scalar* a, const Scalar* b, int size, const typename NumTra
VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
}
#define CHECK_CWISE1_MASK(REFOP, POP) \
{ \
bool ref_mask[PacketSize] = {}; \
bool data_mask[PacketSize] = {}; \
internal::pstore(data2, POP(internal::pload<Packet>(data1))); \
for (int i = 0; i < PacketSize; ++i) { \
ref_mask[i] = numext::is_exactly_zero(REFOP(data1[i])); \
data_mask[i] = numext::is_exactly_zero(data2[i]); \
} \
VERIFY(test::areEqual(ref_mask, data_mask, PacketSize) && #POP); \
}
#define CHECK_CWISE2_MASK(REFOP, POP) \
{ \
bool ref_mask[PacketSize] = {}; \
bool data_mask[PacketSize] = {}; \
internal::pstore(data2, POP(internal::pload<Packet>(data1), internal::pload<Packet>(data1 + PacketSize))); \
for (int i = 0; i < PacketSize; ++i) { \
ref_mask[i] = numext::is_exactly_zero(REFOP(data1[i], data1[i + PacketSize])); \
data_mask[i] = numext::is_exactly_zero(data2[i]); \
} \
VERIFY(test::areEqual(ref_mask, data_mask, PacketSize) && #POP); \
}
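Unlike the CHECK_CWISE*_IF calls they replace for the pcmp_* ops, these macros compare only the boolean pattern (zero vs. non-zero lanes) of the packet result against the scalar reference, rather than raw payload bits. A hypothetical call site, assuming the data1/data2/PacketSize fixtures of the packetmath tests:
// Verify that the vectorized pcmp_le produces the same mask as the scalar reference.
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);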
// Checks component-wise for input of size N. All of data1, data2, and ref
// should have size at least ceil(N/PacketSize)*PacketSize to avoid memory
// access errors.

View File

@ -57,6 +57,10 @@ void product_selfadjoint(const MatrixType& m) {
v1.tail(rows - 1) * v2.head(cols - 1).adjoint() + v2.head(cols - 1) * v1.tail(rows - 1).adjoint();
VERIFY_IS_APPROX(m2, m3.template triangularView<Lower>().toDenseMatrix());
}
// matrix-vector
m2 = m1.template triangularView<Lower>();
VERIFY_IS_APPROX(m1 * m4, m2.template selfadjointView<Lower>() * m4);
}
EIGEN_DECLARE_TEST(product_selfadjoint) {

View File

@ -37,12 +37,9 @@ void matrixRedux(const MatrixType& m) {
m2.array() = m2.array() - kMaxVal * (m2.array() / kMaxVal);
}
VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
VERIFY_IS_APPROX(
MatrixType::Ones(rows, cols).sum(),
Scalar(float(
rows *
cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
VERIFY_IS_EQUAL(MatrixType::Zero(rows, cols).sum(), Scalar(0));
Scalar sizeAsScalar = internal::cast<Index, Scalar>(rows * cols);
VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), sizeAsScalar);
Scalar s(0), p(1), minc(numext::real(m1.coeff(0))), maxc(numext::real(m1.coeff(0)));
for (int j = 0; j < cols; j++)
for (int i = 0; i < rows; i++) {
@ -160,6 +157,10 @@ EIGEN_DECLARE_TEST(redux) {
int maxsize = (std::min)(100, EIGEN_TEST_MAX_SIZE);
TEST_SET_BUT_UNUSED_VARIABLE(maxsize);
for (int i = 0; i < g_repeat; i++) {
int rows = internal::random<int>(1, maxsize);
int cols = internal::random<int>(1, maxsize);
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
CALL_SUBTEST_1(matrixRedux(Matrix<float, 1, 1>()));
CALL_SUBTEST_1(matrixRedux(Array<float, 1, 1>()));
CALL_SUBTEST_2(matrixRedux(Matrix2f()));
@ -168,19 +169,37 @@ EIGEN_DECLARE_TEST(redux) {
CALL_SUBTEST_3(matrixRedux(Matrix4d()));
CALL_SUBTEST_3(matrixRedux(Array4d()));
CALL_SUBTEST_3(matrixRedux(Array44d()));
CALL_SUBTEST_4(matrixRedux(MatrixXcf(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_4(matrixRedux(ArrayXXcf(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_5(matrixRedux(MatrixXd(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_5(matrixRedux(ArrayXXd(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_6(matrixRedux(MatrixXi(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_6(matrixRedux(ArrayXXi(internal::random<int>(1, maxsize), internal::random<int>(1, maxsize))));
CALL_SUBTEST_4(matrixRedux(MatrixXf(rows, cols)));
CALL_SUBTEST_4(matrixRedux(ArrayXXf(rows, cols)));
CALL_SUBTEST_4(matrixRedux(MatrixXd(rows, cols)));
CALL_SUBTEST_4(matrixRedux(ArrayXXd(rows, cols)));
/* TODO: fix test for boolean */
/*CALL_SUBTEST_5(matrixRedux(MatrixX<bool>(rows, cols)));*/
/*CALL_SUBTEST_5(matrixRedux(ArrayXX<bool>(rows, cols)));*/
CALL_SUBTEST_5(matrixRedux(MatrixXi(rows, cols)));
CALL_SUBTEST_5(matrixRedux(ArrayXXi(rows, cols)));
CALL_SUBTEST_5(matrixRedux(MatrixX<int64_t>(rows, cols)));
CALL_SUBTEST_5(matrixRedux(ArrayXX<int64_t>(rows, cols)));
CALL_SUBTEST_6(matrixRedux(MatrixXcf(rows, cols)));
CALL_SUBTEST_6(matrixRedux(ArrayXXcf(rows, cols)));
CALL_SUBTEST_7(matrixRedux(MatrixXcd(rows, cols)));
CALL_SUBTEST_7(matrixRedux(ArrayXXcd(rows, cols)));
}
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_7(vectorRedux(Vector4f()));
CALL_SUBTEST_7(vectorRedux(Array4f()));
CALL_SUBTEST_5(vectorRedux(VectorXd(internal::random<int>(1, maxsize))));
CALL_SUBTEST_5(vectorRedux(ArrayXd(internal::random<int>(1, maxsize))));
CALL_SUBTEST_8(vectorRedux(VectorXf(internal::random<int>(1, maxsize))));
CALL_SUBTEST_8(vectorRedux(ArrayXf(internal::random<int>(1, maxsize))));
int size = internal::random<int>(1, maxsize);
EIGEN_UNUSED_VARIABLE(size);
CALL_SUBTEST_8(vectorRedux(Vector4f()));
CALL_SUBTEST_8(vectorRedux(Array4f()));
CALL_SUBTEST_9(vectorRedux(VectorXf(size)));
CALL_SUBTEST_9(vectorRedux(ArrayXf(size)));
CALL_SUBTEST_10(vectorRedux(VectorXd(size)));
CALL_SUBTEST_10(vectorRedux(ArrayXd(size)));
/* TODO: fix test for boolean */
/*CALL_SUBTEST_10(vectorRedux(VectorX<bool>(size)));*/
/*CALL_SUBTEST_10(vectorRedux(ArrayX<bool>(size)));*/
CALL_SUBTEST_10(vectorRedux(VectorXi(size)));
CALL_SUBTEST_10(vectorRedux(ArrayXi(size)));
CALL_SUBTEST_10(vectorRedux(VectorX<int64_t>(size)));
CALL_SUBTEST_10(vectorRedux(ArrayX<int64_t>(size)));
}
}

View File

@ -9,6 +9,7 @@
#include "main.h"
#ifdef EIGEN_EXCEPTIONS
#define VERIFY_THROWS_BADALLOC(a) \
{ \
bool threw = false; \
@ -19,6 +20,10 @@
} \
VERIFY(threw && "should have thrown bad_alloc: " #a); \
}
#else
// No way to catch a bad alloc - program terminates.
#define VERIFY_THROWS_BADALLOC(a)
#endif
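With exceptions disabled there is no portable way to observe the failed allocation, so the macro expands to nothing, as the comment notes. A sketch of an invocation (the sizes are illustrative, merely large enough that the allocation must fail):
// triggerMatrixBadAlloc is defined just below; 2^40 x 2^40 doubles cannot be allocated.
VERIFY_THROWS_BADALLOC(triggerMatrixBadAlloc<Eigen::MatrixXd>(Eigen::Index(1) << 40, Eigen::Index(1) << 40));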
template <typename MatrixType>
void triggerMatrixBadAlloc(Index rows, Index cols) {

View File

@ -381,6 +381,7 @@ void svd_verify_assert_full_only(const MatrixType& input = MatrixType()) {
typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, 1> RhsType;
RhsType rhs = RhsType::Zero(input.rows());
EIGEN_UNUSED_VARIABLE(rhs); // Only used if asserts are enabled.
MatrixType m(input.rows(), input.cols());
svd_fill_random(m);
@ -410,6 +411,7 @@ void svd_verify_assert(const MatrixType& input = MatrixType()) {
enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime };
typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, 1> RhsType;
RhsType rhs = RhsType::Zero(input.rows());
EIGEN_UNUSED_VARIABLE(rhs); // Only used if asserts are enabled.
MatrixType m(input.rows(), input.cols());
svd_fill_random(m);

View File

@ -214,6 +214,17 @@ void vectorwiseop_matrix(const MatrixType& m) {
VERIFY_IS_EQUAL(m1.real().middleCols(0, fix<0>).colwise().maxCoeff().eval().cols(), 0);
}
void vectorwiseop_mixedscalar() {
Matrix4cd a = Matrix4cd::Random();
Vector4cd b = Vector4cd::Random();
b.imag().setZero();
Vector4d b_real = b.real();
Matrix4cd c = a.array().rowwise() * b.array().transpose();
Matrix4cd d = a.array().rowwise() * b_real.array().transpose();
VERIFY_IS_CWISE_EQUAL(c, d);
}
EIGEN_DECLARE_TEST(vectorwiseop) {
CALL_SUBTEST_1(vectorwiseop_array(Array22cd()));
CALL_SUBTEST_2(vectorwiseop_array(Array<double, 3, 2>()));
@ -226,4 +237,5 @@ EIGEN_DECLARE_TEST(vectorwiseop) {
MatrixXd(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_7(vectorwiseop_matrix(VectorXd(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_7(vectorwiseop_matrix(RowVectorXd(internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
CALL_SUBTEST_8(vectorwiseop_mixedscalar());
}

View File

@ -10,19 +10,11 @@
#include "main.h"
template <typename MatrixType>
void matrixVisitor(const MatrixType& p) {
void matrixVisitor_impl(MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
Index rows = p.rows();
Index cols = p.cols();
// construct a random matrix where all coefficients are different
MatrixType m;
m = MatrixType::Random(rows, cols);
for (Index i = 0; i < m.size(); i++)
for (Index i2 = 0; i2 < i; i2++)
while (numext::equal_strict(m(i), m(i2))) // yes, strict equality
m(i) = internal::random<Scalar>();
Index rows = m.rows();
Index cols = m.cols();
Scalar minc = Scalar(1000), maxc = Scalar(-1000);
Index minrow = 0, mincol = 0, maxrow = 0, maxcol = 0;
@ -119,6 +111,22 @@ void matrixVisitor(const MatrixType& p) {
VERIFY((numext::isnan)(eigen_maxc));
}
}
template <typename MatrixType>
void matrixVisitor(const MatrixType& p) {
MatrixType m(p.rows(), p.cols());
// construct a random matrix where all coefficients are different
m.setRandom();
for (Index i = 0; i < m.size(); i++)
for (Index i2 = 0; i2 < i; i2++)
while (numext::equal_strict(m(i), m(i2))) // yes, strict equality
m(i) = internal::random<typename DenseBase<MatrixType>::Scalar>();
MatrixType n = m;
matrixVisitor_impl(m);
// force outer-inner access pattern
using BlockType = Block<MatrixType, Dynamic, Dynamic>;
BlockType m_block = n.block(0, 0, n.rows(), n.cols());
matrixVisitor_impl(m_block);
}
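Presumably the Block wrapper is used because a dynamic-sized Block expression does not advertise linear access, so visiting m_block drives the visitor's outer/inner traversal, while the plain matrix covers the flat-index path.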
template <typename VectorType>
void vectorVisitor(const VectorType& w) {

View File

@ -24,6 +24,8 @@ void zeroReduction(const MatrixType& m) {
VERIFY_RAISES_ASSERT(m.minCoeff());
VERIFY_RAISES_ASSERT(m.maxCoeff());
Index i, j;
EIGEN_UNUSED_VARIABLE(i); // Only used if exceptions are enabled.
EIGEN_UNUSED_VARIABLE(j);
VERIFY_RAISES_ASSERT(m.minCoeff(&i, &j));
VERIFY_RAISES_ASSERT(m.maxCoeff(&i, &j));
VERIFY_RAISES_ASSERT(m.reshaped().minCoeff(&i));

View File

@ -45,7 +45,7 @@
#include <thread>
#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#include "../../../Eigen/ThreadPool"
#endif
#ifdef EIGEN_USE_GPU

File diff suppressed because it is too large Load Diff

View File

@ -10,14 +10,11 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
// This header file contains defines for gpu* macros which will resolve to
// their equivalent hip* or cuda* versions depending on the compiler in use.
// A separate header (included at the end of this file) will undefine all of them.
#include "TensorGpuHipCudaDefines.h"
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "../../../../../Eigen/src/Core/util/GpuHipCudaDefines.inc"
namespace Eigen {
static const int kGpuScratchSize = 1024;
@ -390,6 +387,6 @@ static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig co
} // end namespace Eigen
// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"
#include "../../../../../Eigen/src/Core/util/GpuHipCudaUndefines.inc"
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
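Net effect of this hunk: the gpu*-to-hip*/cuda* macro definitions move out of the unsupported Tensor module into shared Eigen/src/Core/util/*.inc files, so Core and Tensor consume one copy of the define/undefine pair instead of the Tensor-local headers.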

View File

@ -37,12 +37,13 @@
* - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
* - MKL (https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html) : fastest, free -- may be
* incompatible with Eigen in GPL form.
* - pocketfft (https://gitlab.mpcdf.mpg.de/mtr/pocketfft) : faster than kissfft, BSD 3-clause.
* - PocketFFT/DUCC (https://gitlab.mpcdf.mpg.de/mtr/pocketfft, https://gitlab.mpcdf.mpg.de/mtr/ducc) : faster than kissfft, BSD 3-clause.
* It is a heavily modified implementation of FFTPack, with the following advantages:
* 1. strictly C++11 compliant
* 2. more accurate twiddle factor computation
* 3. very fast plan generation
* 4. worst case complexity for transform sizes with large prime factors is N*log(N), because Bluestein's
* algorithm is used for these cases
* According to the author, DUCC contains the "evolution" of pocketfft, though the interface is very similar.
*
* \section FFTDesign Design
@ -85,7 +86,7 @@
#ifdef EIGEN_FFTW_DEFAULT
// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
#include <fftw3.h>
#include "src/FFT/ei_fftw_impl.h"
#include "src/FFT/fftw_impl.h"
namespace Eigen {
// template <typename T> typedef struct internal::fftw_impl default_fft_impl; this does not work
template <typename T>
@ -93,7 +94,7 @@ struct default_fft_impl : public internal::fftw_impl<T> {};
} // namespace Eigen
#elif defined EIGEN_MKL_DEFAULT
// intel Math Kernel Library: fastest, free -- may be incompatible with Eigen in GPL form
#include "src/FFT/ei_imklfft_impl.h"
#include "src/FFT/imklfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::imklfft::imklfft_impl<T> {};
@ -101,14 +102,24 @@ struct default_fft_impl : public internal::imklfft::imklfft_impl<T> {};
#elif defined EIGEN_POCKETFFT_DEFAULT
// internal::pocketfft_impl: a heavily modified implementation of FFTPack, with many advantages.
#include <pocketfft_hdronly.h>
#include "src/FFT/ei_pocketfft_impl.h"
#include "src/FFT/pocketfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::pocketfft_impl<T> {};
} // namespace Eigen
#elif defined EIGEN_DUCCFFT_DEFAULT
#include <ducc0/fft/fft.h>
#include <ducc0/infra/string_utils.h>
#include <ducc0/fft/fft.h>
#include <ducc0/fft/fftnd_impl.h>
#include "src/FFT/duccfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::duccfft_impl<T> {};
} // namespace Eigen
#else
// internal::kissfft_impl: small, free, reasonably efficient default, derived from kissfft
#include "src/FFT/ei_kissfft_impl.h"
#include "src/FFT/kissfft_impl.h"
namespace Eigen {
template <typename T>
struct default_fft_impl : public internal::kissfft_impl<T> {};
@ -204,7 +215,8 @@ class FFT {
inline void fwd(Complex* dst, const Complex* src, Index nfft) { m_impl.fwd(dst, src, static_cast<int>(nfft)); }
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
inline void fwd2(Complex* dst, const Complex* src, int n0, int n1) { m_impl.fwd2(dst, src, n0, n1); }
#endif
@ -366,7 +378,8 @@ class FFT {
inv(&dst[0], &src[0], nfft);
}
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
inline void inv2(Complex* dst, const Complex* src, int n0, int n1) {
m_impl.inv2(dst, src, n0, n1);
if (HasFlag(Unscaled) == false) scale(dst, 1. / (n0 * n1), n0 * n1);
@ -385,7 +398,6 @@ class FFT {
Matrix<T_Data, Dynamic, 1>::Map(x, nx) *= s;
else
Matrix<T_Data, Dynamic, 1>::MapAligned(x, nx) *= s;
// Matrix<T_Data, Dynamic, Dynamic>::Map(x,nx) * s;
#endif
}

View File

@ -0,0 +1,71 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
namespace Eigen {
namespace internal {
template <typename _Scalar>
struct duccfft_impl {
using Scalar = _Scalar;
using Complex = std::complex<Scalar>;
using shape_t = ducc0::fmav_info::shape_t;
using stride_t = ducc0::fmav_info::stride_t;
inline void clear() {}
inline void fwd(Complex* dst, const Scalar* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Scalar> m_in(src, shape_t{static_cast<size_t>(nfft)});
ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft) / 2 + 1});
ducc0::r2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
}
inline void fwd(Complex* dst, const Complex* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft)});
ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft)});
ducc0::c2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
}
inline void inv(Scalar* dst, const Complex* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft) / 2 + 1});
ducc0::vfmav<Scalar> m_out(dst, shape_t{static_cast<size_t>(nfft)});
ducc0::c2r(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
}
inline void inv(Complex* dst, const Complex* src, int nfft) {
const shape_t axes{0};
ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft)});
ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft)});
ducc0::c2c(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
}
inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t axes{0, 1};
const shape_t in_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t out_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const stride_t stride{static_cast<ptrdiff_t>(nfft1), static_cast<ptrdiff_t>(1)};
ducc0::cfmav<Complex> m_in(src, in_shape, stride);
ducc0::vfmav<Complex> m_out(dst, out_shape, stride);
ducc0::c2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
}
inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t axes{0, 1};
const shape_t in_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t out_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const stride_t stride{static_cast<ptrdiff_t>(nfft1), static_cast<ptrdiff_t>(1)};
ducc0::cfmav<Complex> m_in(src, in_shape, stride);
ducc0::vfmav<Complex> m_out(dst, out_shape, stride);
ducc0::c2c(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
}
};
} // namespace internal
} // namespace Eigen
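A minimal round-trip sketch against the new backend, assuming the ducc0 headers are on the include path and mirroring the include order required by the duccfft test added below:
#define EIGEN_DUCCFFT_DEFAULT 1
#include <ducc0/fft/fft.h>         // must precede the Eigen FFT header
#include <ducc0/fft/fftnd_impl.h>
#include <unsupported/Eigen/FFT>
#include <complex>
#include <vector>
int main() {
  Eigen::FFT<double> fft;
  std::vector<double> signal(64, 1.0);
  std::vector<std::complex<double>> spectrum;
  fft.fwd(spectrum, signal);  // r2c forward, routed through duccfft_impl::fwd
  std::vector<double> back;
  fft.inv(back, spectrum);    // c2r inverse, rescaled unless the Unscaled flag is set
  return 0;
}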

View File

@ -5,17 +5,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
using namespace pocketfft;
using namespace pocketfft::detail;
namespace Eigen {
namespace internal {
template <typename _Scalar>
struct pocketfft_impl {
typedef _Scalar Scalar;
typedef std::complex<Scalar> Complex;
using Scalar = _Scalar;
using Complex = std::complex<Scalar>;
using shape_t = pocketfft::shape_t;
using stride_t = pocketfft::stride_t;
inline void clear() {}
@ -24,14 +23,14 @@ struct pocketfft_impl {
const shape_t axes_{0};
const stride_t stride_in{sizeof(Scalar)};
const stride_t stride_out{sizeof(Complex)};
r2c(shape_, stride_in, stride_out, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
pocketfft::r2c(shape_, stride_in, stride_out, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
}
inline void fwd(Complex* dst, const Complex* src, int nfft) {
const shape_t shape_{static_cast<size_t>(nfft)};
const shape_t axes_{0};
const stride_t stride_{sizeof(Complex)};
c2c(shape_, stride_, stride_, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
}
inline void inv(Scalar* dst, const Complex* src, int nfft) {
@ -39,28 +38,28 @@ struct pocketfft_impl {
const shape_t axes_{0};
const stride_t stride_in{sizeof(Complex)};
const stride_t stride_out{sizeof(Scalar)};
c2r(shape_, stride_in, stride_out, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2r(shape_, stride_in, stride_out, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
}
inline void inv(Complex* dst, const Complex* src, int nfft) {
const shape_t shape_{static_cast<size_t>(nfft)};
const shape_t axes_{0};
const stride_t stride_{sizeof(Complex)};
c2c(shape_, stride_, stride_, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
}
inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t shape_{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t axes_{0, 1};
const stride_t stride_{static_cast<ptrdiff_t>(sizeof(Complex) * nfft1), static_cast<ptrdiff_t>(sizeof(Complex))};
c2c(shape_, stride_, stride_, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
}
inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
const shape_t shape_{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
const shape_t axes_{0, 1};
const stride_t stride_{static_cast<ptrdiff_t>(sizeof(Complex) * nfft1), static_cast<ptrdiff_t>(sizeof(Complex))};
c2c(shape_, stride_, stride_, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
}
};

View File

@ -284,12 +284,13 @@ template <typename MatrixType>
struct matrix_exp_computeUV<MatrixType, long double> {
template <typename ArgType>
static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) {
using Scalar = typename traits<MatrixType>::Scalar;
#if LDBL_MANT_DIG == 53 // double precision
matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
#else
using Scalar = typename traits<MatrixType>::Scalar;
using std::frexp;
using std::pow;
const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();

View File

@ -1455,7 +1455,7 @@ struct zeta_impl {
if (q <= zero) {
if (q == numext::floor(q)) {
if (x == numext::floor(x) && long(x) % 2 == 0) {
if (numext::rint(Scalar(0.5) * x) == Scalar(0.5) * x) {
return maxnum;
} else {
return nan;
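The new predicate folds the old floor-and-parity test into one overflow-safe comparison: rint(0.5 * x) == 0.5 * x holds exactly when 0.5 * x is an integer, i.e. when x is an even integer, and it avoids the long(x) cast, which can overflow for large x. For instance:
// x = 4.0:  0.5*x = 2.0,  rint(2.0) == 2.0   -> even integer, return maxnum
// x = 3.0:  0.5*x = 1.5,  rint(1.5) == 2.0   -> mismatch (odd), return nan
// x = 2.5:  0.5*x = 1.25, rint(1.25) == 1.0  -> mismatch (non-integer), return nan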

View File

@ -88,6 +88,25 @@ else()
ei_add_property(EIGEN_MISSING_BACKENDS "pocketfft, ")
endif()
if(NOT DUCC_ROOT AND DEFINED ENV{DUCC_ROOT})
set( DUCC_ROOT $ENV{DUCC_ROOT} )
endif()
find_path(DUCCFFT
NAMES "src/ducc0/fft/fft.h"
PATHS ${DUCC_ROOT})
message(STATUS "${DUCC_ROOT} ${DUCCFFT}")
if(DUCCFFT)
ei_add_property(EIGEN_TESTED_BACKENDS "duccfft, ")
include_directories( "${DUCCFFT}/src" )
add_library(ducc_lib "${DUCCFFT}/src/ducc0/infra/string_utils.cc" "${DUCCFFT}/src/ducc0/infra/threading.cc")
target_compile_definitions(ducc_lib PUBLIC "DUCC0_NO_THREADING=1")
ei_add_test(duccfft "-DEIGEN_DUCCFFT_DEFAULT -DDUCC0_NO_THREADING=1" "ducc_lib" )
set_target_properties(ducc_lib duccfft PROPERTIES CXX_STANDARD 17)
else()
ei_add_property(EIGEN_MISSING_BACKENDS "duccfft, ")
endif()
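For local builds, the block above locates a DUCC source checkout either via -DDUCC_ROOT=<path> on the cmake command line or the DUCC_ROOT environment variable, compiles the two infra translation units into ducc_lib with threading disabled, and registers the duccfft test against it.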
option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
if(EIGEN_TEST_OPENGL)
find_package(OpenGL)

View File

@ -14,8 +14,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
template <int Layout>

View File

@ -17,8 +17,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;

View File

@ -17,8 +17,6 @@
#include "OffByOneScalar.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::RowMajor;
using Eigen::Tensor;

View File

@ -15,8 +15,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
void test_gpu_nullary() {

View File

@ -16,8 +16,6 @@
#include "main.h"
#include <Eigen/CXX11/Tensor>
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
void test_gpu_random_uniform() {
Tensor<float, 2> out(72, 97);
out.setZero();

View File

@ -16,8 +16,6 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;

View File

@ -0,0 +1,4 @@
#define EIGEN_DUCCFFT_DEFAULT 1
#include <ducc0/fft/fft.h> // Needs to be included before main.h
#include <ducc0/fft/fftnd_impl.h> // Same requirement
#include "fft_test_shared.h"

View File

@ -272,7 +272,7 @@ EIGEN_DECLARE_TEST(FFTW) {
CALL_SUBTEST(test_scalar<float>(2 * 3 * 4 * 5 * 7));
CALL_SUBTEST(test_scalar<double>(2 * 3 * 4 * 5 * 7));
#if defined EIGEN_HAS_FFTWL || defined EIGEN_POCKETFFT_DEFAULT
#if defined EIGEN_HAS_FFTWL || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT
CALL_SUBTEST(test_complex<long double>(32));
CALL_SUBTEST(test_complex<long double>(256));
CALL_SUBTEST(test_complex<long double>(3 * 8));
@ -294,13 +294,15 @@ EIGEN_DECLARE_TEST(FFTW) {
// fail to build since Eigen limit the stack allocation size,too big here.
// CALL_SUBTEST( ( test_complex2d<long double, 256, 256> () ) );
#endif
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
CALL_SUBTEST((test_complex2d<float, 24, 24>()));
CALL_SUBTEST((test_complex2d<float, 60, 60>()));
CALL_SUBTEST((test_complex2d<float, 24, 60>()));
CALL_SUBTEST((test_complex2d<float, 60, 24>()));
#endif
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
defined EIGEN_MKL_DEFAULT
CALL_SUBTEST((test_complex2d<double, 24, 24>()));
CALL_SUBTEST((test_complex2d<double, 60, 60>()));
CALL_SUBTEST((test_complex2d<double, 24, 60>()));