diff --git a/Eigen/Core b/Eigen/Core index 6ae069a92..8944d5450 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -192,45 +192,38 @@ using std::ptrdiff_t; #include "src/Core/arch/Default/BFloat16.h" #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" -#if defined EIGEN_VECTORIZE_AVX512 +#if defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" +#include "src/Core/arch/SSE/Reductions.h" +#include "src/Core/arch/SSE/Complex.h" +#include "src/Core/arch/SSE/TypeCasting.h" +#include "src/Core/arch/SSE/MathFunctions.h" +#endif + +#if defined EIGEN_VECTORIZE_AVX #include "src/Core/arch/AVX/PacketMath.h" +#include "src/Core/arch/AVX/Reductions.h" +#include "src/Core/arch/AVX/Complex.h" +#include "src/Core/arch/AVX/TypeCasting.h" +#include "src/Core/arch/AVX/MathFunctions.h" +#endif + +#if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/AVX512/PacketMath.h" +#include "src/Core/arch/AVX512/Reductions.h" +#include "src/Core/arch/AVX512/Complex.h" +#include "src/Core/arch/AVX512/TypeCasting.h" +#include "src/Core/arch/AVX512/MathFunctions.h" +#include "src/Core/arch/AVX512/TrsmKernel.h" +#endif + #if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/PacketMathFP16.h" -#endif -#include "src/Core/arch/SSE/TypeCasting.h" -#include "src/Core/arch/AVX/TypeCasting.h" -#include "src/Core/arch/AVX512/TypeCasting.h" -#if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/TypeCastingFP16.h" -#endif -#include "src/Core/arch/SSE/Complex.h" -#include "src/Core/arch/AVX/Complex.h" -#include "src/Core/arch/AVX512/Complex.h" -#include "src/Core/arch/SSE/MathFunctions.h" -#include "src/Core/arch/AVX/MathFunctions.h" -#include "src/Core/arch/AVX512/MathFunctions.h" -#if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/MathFunctionsFP16.h" #endif -#include "src/Core/arch/AVX512/TrsmKernel.h" -#elif defined EIGEN_VECTORIZE_AVX - // Use AVX for floats and doubles, SSE for integers -#include 
"src/Core/arch/SSE/PacketMath.h" -#include "src/Core/arch/SSE/TypeCasting.h" -#include "src/Core/arch/SSE/Complex.h" -#include "src/Core/arch/AVX/PacketMath.h" -#include "src/Core/arch/AVX/TypeCasting.h" -#include "src/Core/arch/AVX/Complex.h" -#include "src/Core/arch/SSE/MathFunctions.h" -#include "src/Core/arch/AVX/MathFunctions.h" -#elif defined EIGEN_VECTORIZE_SSE -#include "src/Core/arch/SSE/PacketMath.h" -#include "src/Core/arch/SSE/TypeCasting.h" -#include "src/Core/arch/SSE/MathFunctions.h" -#include "src/Core/arch/SSE/Complex.h" -#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) + +#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" #include "src/Core/arch/AltiVec/TypeCasting.h" #include "src/Core/arch/AltiVec/MathFunctions.h" diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 470e36d8d..1b1d326b3 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -654,25 +654,6 @@ template <> EIGEN_STRONG_INLINE uint64_t pfirst(const Packet4ul& a) { return _mm_extract_epi64_0(_mm256_castsi256_si128(a)); } -template <> -EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) { - __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - return _mm_extract_epi64_0(r) + _mm_extract_epi64_1(r); -} -template <> -EIGEN_STRONG_INLINE uint64_t predux(const Packet4ul& a) { - __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - return numext::bit_cast(_mm_extract_epi64_0(r) + _mm_extract_epi64_1(r)); -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) { - return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0; -} -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) { - return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0; -} #define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), 
_mm256_castsi256_pd(B), M) EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -1955,23 +1936,6 @@ EIGEN_STRONG_INLINE Packet4d pldexp_fast(const Packet4d& a, const Pack return pmul(a, c); // a * 2^e } -template <> -EIGEN_STRONG_INLINE float predux(const Packet8f& a) { - return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux(const Packet4d& a) { - return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1)))); -} -template <> -EIGEN_STRONG_INLINE int predux(const Packet8i& a) { - return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)))); -} -template <> -EIGEN_STRONG_INLINE uint32_t predux(const Packet8ui& a) { - return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)))); -} - template <> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)); @@ -1985,82 +1949,6 @@ EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4(const Packet8ui& a) return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); } -template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) { - Packet8f tmp; - tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1)); - tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2))); - return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1))); -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) { - Packet4d tmp; - tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1)); - return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) { - Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1)); - tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2))); - return 
pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1))); -} -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) { - Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1)); - return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) { - Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1)); - tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2))); - return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1))); -} - -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) { - Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1)); - return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); -} - -// not needed yet -// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) -// { -// return _mm256_movemask_ps(x)==0xFF; -// } - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) { - return _mm256_movemask_ps(x) != 0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet4d& x) { - return _mm256_movemask_pd(x) != 0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) { - return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0; -} -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) { - return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0; -} - -#ifndef EIGEN_VECTORIZE_AVX512FP16 -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8h& x) { - return _mm_movemask_epi8(x) != 0; -} -#endif // EIGEN_VECTORIZE_AVX512FP16 - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& x) { - return _mm_movemask_epi8(x) != 0; -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); @@ -2473,34 +2361,6 @@ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, 
const to[stride * 7] = aux[7]; } -template <> -EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux(af); - return Eigen::half(reduced); -} - -template <> -EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template <> -EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template <> -EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - template <> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) { __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); @@ -2859,26 +2719,6 @@ EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packe to[stride * 7] = aux[7]; } -template <> -EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) { - return static_cast(predux(Bf16ToF32(a))); -} - -template <> -EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { - return static_cast(predux_max(Bf16ToF32(a))); -} - -template <> -EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) { - return static_cast(predux_min(Bf16ToF32(a))); -} - -template <> -EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) { - return static_cast(predux_mul(Bf16ToF32(a))); -} - template <> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) { __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); diff --git a/Eigen/src/Core/arch/AVX/Reductions.h b/Eigen/src/Core/arch/AVX/Reductions.h new file mode 100644 index 000000000..8eed4be31 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/Reductions.h @@ -0,0 +1,393 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2025 Charlie Schlosser +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_REDUCTIONS_AVX_H +#define EIGEN_REDUCTIONS_AVX_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8i -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE int predux(const Packet8i& a) { + Packet4i lo = _mm256_castsi256_si128(a); + Packet4i hi = _mm256_extractf128_si256(a, 1); + return predux(padd(lo, hi)); +} + +template <> +EIGEN_STRONG_INLINE int predux_mul(const Packet8i& a) { + Packet4i lo = _mm256_castsi256_si128(a); + Packet4i hi = _mm256_extractf128_si256(a, 1); + return predux_mul(pmul(lo, hi)); +} + +template <> +EIGEN_STRONG_INLINE int predux_min(const Packet8i& a) { + Packet4i lo = _mm256_castsi256_si128(a); + Packet4i hi = _mm256_extractf128_si256(a, 1); + return predux_min(pmin(lo, hi)); +} + +template <> +EIGEN_STRONG_INLINE int predux_max(const Packet8i& a) { + Packet4i lo = _mm256_castsi256_si128(a); + Packet4i hi = _mm256_extractf128_si256(a, 1); + return predux_max(pmax(lo, hi)); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet8i& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_movemask_epi8(a) != 0x0; +#else + return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0; +#endif +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8ui -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE uint32_t predux(const Packet8ui& a) { + Packet4ui lo = _mm256_castsi256_si128(a); + Packet4ui hi = _mm256_extractf128_si256(a, 1); + return predux(padd(lo, hi)); +} + +template <> +EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet8ui& a) { + Packet4ui lo = _mm256_castsi256_si128(a); + Packet4ui hi = _mm256_extractf128_si256(a, 
1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min(const Packet8ui& a) {
+  Packet4ui lo = _mm256_castsi256_si128(a);
+  Packet4ui hi = _mm256_extractf128_si256(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max(const Packet8ui& a) {
+  Packet4ui lo = _mm256_castsi256_si128(a);
+  Packet4ui hi = _mm256_extractf128_si256(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_movemask_epi8(a) != 0x0;
+#else
+  return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
+#endif
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) {
+  Packet2l lo = _mm256_castsi256_si128(a);
+  Packet2l hi = _mm256_extractf128_si256(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
+  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ul -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE uint64_t predux(const Packet4ul& a) {
+  return static_cast<uint64_t>(predux(Packet4l(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
+  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
+}
+
+#endif
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateFast>(const Packet8f& a) {
+  return predux_min(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateFast>(const Packet8f& a) {
+  return predux_max(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+  return _mm256_movemask_ps(a) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template
<>
+EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateFast>(const Packet4d& a) {
+  return predux_min(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateFast>(const Packet4d& a) {
+  return predux_max(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4d& a) {
+  return _mm256_movemask_pd(a) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8h -- -- -- -- -- -- -- -- -- -- -- -- */
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+
+template <>
+EIGEN_STRONG_INLINE half predux(const Packet8h& a) {
+  return static_cast<half>(predux(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul(const Packet8h& a) {
+  return static_cast<half>(predux_mul(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min(const Packet8h& a) {
+  return
static_cast<half>(predux_min(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateFast>(const Packet8h& a) {
+  return static_cast<half>(predux_min(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet8h& a) {
+  return static_cast<half>(predux_min<PropagateNumbers>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet8h& a) {
+  return static_cast<half>(predux_min<PropagateNaN>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max(const Packet8h& a) {
+  return static_cast<half>(predux_max(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateFast>(const Packet8h& a) {
+  return static_cast<half>(predux_max(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet8h& a) {
+  return static_cast<half>(predux_max<PropagateNumbers>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet8h& a) {
+  return static_cast<half>(predux_max<PropagateNaN>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8h& a) {
+  return _mm_movemask_epi8(a) != 0;
+}
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8bf -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_mul(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateFast>(const Packet8bf& a) {
+  return predux_min(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) {
+  return
static_cast(predux_max(Bf16ToF32(a))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { + return predux_max(a); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { + return static_cast(predux_max(Bf16ToF32(a))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { + return static_cast(predux_max(Bf16ToF32(a))); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& a) { + return _mm_movemask_epi8(a) != 0; +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_REDUCTIONS_AVX_H diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 27a0f1023..932b0568d 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -1494,40 +1494,6 @@ EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3); #endif -template <> -EIGEN_STRONG_INLINE float predux(const Packet16f& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - __m256 lane0 = _mm512_extractf32x8_ps(a, 0); - __m256 lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f x = _mm256_add_ps(lane0, lane1); - return predux(x); -#else - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3)); - return predux(sum); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d sum = _mm256_add_pd(lane0, lane1); - return predux(sum); -} - -template <> -EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) { - return _mm512_reduce_add_epi64(a); -} - -template <> -EIGEN_STRONG_INLINE int predux(const Packet16i& a) { - 
return _mm512_reduce_add_epi32(a); -} - template <> EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ @@ -1574,136 +1540,6 @@ EIGEN_STRONG_INLINE Packet4l predux_half_dowto4(const Packet8l& a) { return _mm256_add_epi64(lane0, lane1); } -template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { -// #ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#else - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); -} -template <> -EIGEN_STRONG_INLINE int predux_mul(const Packet16i& a) { - return _mm512_reduce_mul_epi32(a); -} - -#if EIGEN_COMP_MSVC -// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939. -// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 }; -// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data)); -// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative. 
-// Fall back to a manual approach: -template <> -EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) { - Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0); - Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1); - Packet4l res = pmul(lane0, lane1); - res = pmul(res, Packet4l(_mm256_permute2x128_si256(res, res, 1))); - res = pmul(res, Packet4l(_mm256_shuffle_epi32(res, 0xE))); - return pfirst(res); -} -#else -template <> -EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) { - return _mm512_reduce_mul_epi64(a); -} -#endif - -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); - res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d res = _mm256_min_pd(lane0, lane1); - res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); -} -template <> -EIGEN_STRONG_INLINE int predux_min(const Packet16i& a) { - return _mm512_reduce_min_epi32(a); -} -template <> -EIGEN_STRONG_INLINE int64_t predux_min(const Packet8l& a) { - return _mm512_reduce_min_epi64(a); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); - res = _mm_max_ps(res, _mm_permute_ps(res, 
_MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} - -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d res = _mm256_max_pd(lane0, lane1); - res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); -} -template <> -EIGEN_STRONG_INLINE int predux_max(const Packet16i& a) { - return _mm512_reduce_max_epi32(a); -} -template <> -EIGEN_STRONG_INLINE int64_t predux_max(const Packet8l& a) { - return _mm512_reduce_max_epi64(a); -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) { - return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) { - return _mm512_reduce_or_epi32(a) != 0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) { - return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) { - return _mm512_reduce_or_epi64(a) != 0; -} - #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); @@ -2466,12 +2302,6 @@ EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet return float2half(pnmsub(half2float(a), half2float(b), half2float(c))); } -template <> -EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - template <> EIGEN_STRONG_INLINE Packet8h predux_half_dowto4(const Packet16h& a) { Packet8h lane0 = _mm256_extractf128_si256(a, 0); @@ -2479,26 +2309,6 @@ EIGEN_STRONG_INLINE Packet8h predux_half_dowto4(const Packet16h& a) { return padd(lane0, lane1); } -template <> -EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet16h& a) { - Packet16f af 
= half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template <> -EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet16h& a) { - Packet16f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template <> -EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux_mul(from_float)); -} - template <> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) { __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); @@ -3005,26 +2815,6 @@ EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4(const Packet16bf& a return padd(lane0, lane1); } -template <> -EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& p) { - return static_cast(predux(Bf16ToF32(p))); -} - -template <> -EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) { - return static_cast(predux_mul(Bf16ToF32(from))); -} - -template <> -EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { - return static_cast(predux_min(Bf16ToF32(from))); -} - -template <> -EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { - return static_cast(predux_max(Bf16ToF32(from))); -} - template <> EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) { __m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, diff --git a/Eigen/src/Core/arch/AVX512/Reductions.h b/Eigen/src/Core/arch/AVX512/Reductions.h new file mode 100644 index 000000000..e6d5bae26 --- /dev/null +++ b/Eigen/src/Core/arch/AVX512/Reductions.h @@ -0,0 +1,337 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Charlie Schlosser +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_REDUCTIONS_AVX512_H +#define EIGEN_REDUCTIONS_AVX512_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16i -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE int predux(const Packet16i& a) { + return _mm512_reduce_add_epi32(a); +} + +template <> +EIGEN_STRONG_INLINE int predux_mul(const Packet16i& a) { + return _mm512_reduce_mul_epi32(a); +} + +template <> +EIGEN_STRONG_INLINE int predux_min(const Packet16i& a) { + return _mm512_reduce_min_epi32(a); +} + +template <> +EIGEN_STRONG_INLINE int predux_max(const Packet16i& a) { + return _mm512_reduce_max_epi32(a); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) { + return _mm512_reduce_or_epi32(a) != 0; +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8l -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) { + return _mm512_reduce_add_epi64(a); +} + +#if EIGEN_COMP_MSVC +// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939. +// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 }; +// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data)); +// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative. 
+// Fall back to a manual approach:
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
+  Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
+  Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
+  return predux_mul(pmul(lane0, lane1));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
+  return _mm512_reduce_mul_epi64(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min(const Packet8l& a) {
+  return _mm512_reduce_min_epi64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max(const Packet8l& a) {
+  return _mm512_reduce_max_epi64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
+  return _mm512_reduce_or_epi64(a) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet16f& a) {
+  return _mm512_reduce_add_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) {
+  return _mm512_reduce_mul_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
+  return _mm512_reduce_min_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateFast>(const Packet16f& a) {
+  return _mm512_reduce_min_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
+  return _mm512_reduce_max_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateFast>(const Packet16f& a) {
+  return _mm512_reduce_max_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet16f&
a) { + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + return predux_max(pmax(lane0, lane1)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + return predux_max(pmax(lane0, lane1)); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) { + return _mm512_reduce_or_epi32(_mm512_castps_si512(x)) != 0; +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8d -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE double predux(const Packet8d& a) { + return _mm512_reduce_add_pd(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { + return _mm512_reduce_mul_pd(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { + return _mm512_reduce_min_pd(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { + return _mm512_reduce_min_pd(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + return predux_min(pmin(lane0, lane1)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + return predux_min(pmin(lane0, lane1)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + return _mm512_reduce_max_pd(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + return _mm512_reduce_max_pd(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + return predux_max(pmax(lane0, lane1)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + Packet4d 
lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + return predux_max(pmax(lane0, lane1)); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet8d& x) { + return _mm512_reduce_or_epi64(_mm512_castpd_si512(x)) != 0; +} + +#ifndef EIGEN_VECTORIZE_AVX512FP16 +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16h -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + return half(predux(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + return half(predux_mul(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) { + return half(predux_min(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) { + return half(predux_min(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) { + return half(predux_min(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) { + return half(predux_min(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) { + return half(predux_max(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) { + return half(predux_max(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) { + return half(predux_max(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) { + return half(predux_max(half2float(from))); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet16h& x) { + return predux_any(x.m_val); +} +#endif + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16bf -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& from) { + return static_cast(predux(Bf16ToF32(from))); +} + +template <> 
+EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) { + return static_cast(predux_mul(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { + return static_cast(predux_min(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { + return static_cast(predux_min(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { + return static_cast(predux_min(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { + return static_cast(predux_min(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { + return static_cast(predux_max(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { + return static_cast(predux_max(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { + return static_cast(predux_max(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { + return static_cast(predux_max(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet16bf& x) { + return predux_any(x.m_val); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_REDUCTIONS_AVX512_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 70d13d6af..e8902cff6 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -1857,220 +1857,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); } -template <> -EIGEN_STRONG_INLINE float predux(const Packet4f& a) { - // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures - // (from Nehalem to Haswell) - // #ifdef 
EIGEN_VECTORIZE_SSE3 - // Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3)); - // return pfirst(_mm_hadd_ps(tmp, tmp)); - // #else - Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a)); - return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1))); - // #endif -} - -template <> -EIGEN_STRONG_INLINE double predux(const Packet2d& a) { - // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures - // (from Nehalem to Haswell) - // #ifdef EIGEN_VECTORIZE_SSE3 - // return pfirst(_mm_hadd_pd(a, a)); - // #else - return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a, a))); - // #endif -} - -template <> -EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) { - return pfirst(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a))); -} - -#ifdef EIGEN_VECTORIZE_SSSE3 -template <> -EIGEN_STRONG_INLINE int predux(const Packet4i& a) { - Packet4i tmp0 = _mm_hadd_epi32(a, a); - return pfirst(_mm_hadd_epi32(tmp0, tmp0)); -} -template <> -EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { - Packet4ui tmp0 = _mm_hadd_epi32(a, a); - return pfirst(_mm_hadd_epi32(tmp0, tmp0)); -} -#else -template <> -EIGEN_STRONG_INLINE int predux(const Packet4i& a) { - Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); -} -template <> -EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { - Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); -} -#endif - -template <> -EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { - Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a)); - return (pfirst(tmp) != 0) || (pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); -} - -// Other reduction functions: - -// mul -template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { - Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a)); - return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1))); -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const 
Packet2d& a) { - return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a, a))); -} -template <> -EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) { - EIGEN_ALIGN16 int64_t aux[2]; - pstore(aux, a); - return aux[0] * aux[1]; -} -template <> -EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (e.g., reusing pmul is very slow!) - // TODO try to call _mm_mul_epu32 directly - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]); -} -template <> -EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) { - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., reusing pmul is very slow !) - // TODO try to call _mm_mul_epu32 directly - EIGEN_ALIGN16 uint32_t aux[4]; - pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]); -} - -template <> -EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) { - Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a)); - return ((pfirst(tmp) == 0x01010101) && (pfirst(_mm_shuffle_epi32(tmp, 1)) == 0x01010101)); -} - -// min -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { - Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a)); - return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1))); -} -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { - return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a, a))); -} -template <> -EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1))); -#else - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., it does not like using std::min after the pstore !!) - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - int aux0 = aux[0] < aux[1] ? 
aux[0] : aux[1]; - int aux2 = aux[2] < aux[3] ? aux[2] : aux[3]; - return aux0 < aux2 ? aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} -template <> -EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1))); -#else - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., it does not like using std::min after the pstore !!) - EIGEN_ALIGN16 uint32_t aux[4]; - pstore(aux, a); - uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1]; - uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3]; - return aux0 < aux2 ? aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} - -// max -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { - Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a)); - return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1))); -} -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { - return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a, a))); -} -template <> -EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1))); -#else - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., it does not like using std::min after the pstore !!) - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - int aux0 = aux[0] > aux[1] ? aux[0] : aux[1]; - int aux2 = aux[2] > aux[3] ? aux[2] : aux[3]; - return aux0 > aux2 ? 
aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} -template <> -EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1))); -#else - // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., it does not like using std::min after the pstore !!) - EIGEN_ALIGN16 uint32_t aux[4]; - pstore(aux, a); - uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1]; - uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3]; - return aux0 > aux2 ? aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} - -// not needed yet -// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) -// { -// return _mm_movemask_ps(x) == 0xF; -// } - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) { - return _mm_movemask_pd(x) != 0x0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { - return _mm_movemask_ps(x) != 0x0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) { - return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0; -} - -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) { - return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; -} -template <> -EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) { - return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; -} - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); } diff --git a/Eigen/src/Core/arch/SSE/Reductions.h b/Eigen/src/Core/arch/SSE/Reductions.h new file mode 100644 index 000000000..dd33da211 --- /dev/null +++ b/Eigen/src/Core/arch/SSE/Reductions.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2025 Charlie Schlosser +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_REDUCTIONS_SSE_H +#define EIGEN_REDUCTIONS_SSE_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct sse_add_wrapper { + static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return padd(a, b); } +}; + +template +struct sse_mul_wrapper { + static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmul(a, b); } +}; + +template +struct sse_min_wrapper { + static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmin(a, b); } +}; + +template +struct sse_min_prop_wrapper { + static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { + return pmin(a, b); + } +}; + +template +struct sse_max_wrapper { + static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmax(a, b); } +}; + +template +struct sse_max_prop_wrapper { + static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { + return pmax(a, b); + } +}; + +template +struct sse_predux_common; + +template +struct sse_predux_impl : sse_predux_common> {}; + +template +struct sse_predux_mul_impl : sse_predux_common> {}; + +template +struct sse_predux_min_impl : sse_predux_common> {}; + +template +struct sse_predux_min_prop_impl : sse_predux_common> {}; + +template +struct sse_predux_max_impl : sse_predux_common> {}; + +template +struct sse_predux_max_prop_impl : sse_predux_common> {}; + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16b -- -- -- -- -- -- -- -- -- -- -- -- */ + +template <> +EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { + Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a)); + return (pfirst(tmp) != 0) || 
(pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); +} + +template <> +EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) { + Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a)); + return ((pfirst(tmp) == 0x01010101) && (pfirst(_mm_shuffle_epi32(tmp, 1)) == 0x01010101)); +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4i -- -- -- -- -- -- -- -- -- -- -- -- */ + +template +struct sse_predux_common { + static EIGEN_STRONG_INLINE int run(const Packet4i& a) { + Packet4i tmp; + tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3))); + tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp)); + return _mm_cvtsi128_si32(tmp); + } +}; + +template <> +EIGEN_STRONG_INLINE int predux(const Packet4i& a) { + return sse_predux_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { + return sse_predux_mul_impl::run(a); +} + +#ifdef EIGEN_VECTORIZE_SSE4_1 +template <> +EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { + return sse_predux_min_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { + return sse_predux_max_impl::run(a); +} +#endif + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) { + return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ui -- -- -- -- -- -- -- -- -- -- -- -- */ + +template +struct sse_predux_common { + static EIGEN_STRONG_INLINE uint32_t run(const Packet4ui& a) { + Packet4ui tmp; + tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3))); + tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp)); + return static_cast(_mm_cvtsi128_si32(tmp)); + } +}; + +template <> +EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { + return sse_predux_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) { + return sse_predux_mul_impl::run(a); +} + +#ifdef EIGEN_VECTORIZE_SSE4_1 +template <> +EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) { + 
return sse_predux_min_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) { + return sse_predux_max_impl::run(a); +} +#endif + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) { + return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2l -- -- -- -- -- -- -- -- -- -- -- -- */ + +template +struct sse_predux_common { + static EIGEN_STRONG_INLINE int64_t run(const Packet2l& a) { + Packet2l tmp; + tmp = Op::packetOp(a, _mm_unpackhi_epi64(a, a)); + return pfirst(tmp); + } +}; + +template <> +EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) { + return sse_predux_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) { + return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0; +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4f -- -- -- -- -- -- -- -- -- -- -- -- */ + +template +struct sse_predux_common { + static EIGEN_STRONG_INLINE float run(const Packet4f& a) { + Packet4f tmp; + tmp = Op::packetOp(a, _mm_movehl_ps(a, a)); +#ifdef EIGEN_VECTORIZE_SSE3 + tmp = Op::packetOp(tmp, _mm_movehdup_ps(tmp)); +#else + tmp = Op::packetOp(tmp, _mm_shuffle_ps(tmp, tmp, 1)); +#endif + return _mm_cvtss_f32(tmp); + } +}; + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4f& a) { + return sse_predux_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { + return sse_predux_mul_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + return sse_predux_min_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + return sse_predux_min_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + return sse_predux_min_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + return sse_predux_min_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE 
float predux_max(const Packet4f& a) { + return sse_predux_max_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + return sse_predux_max_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + return sse_predux_max_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + return sse_predux_max_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { + return _mm_movemask_ps(x) != 0x0; +} + +/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2d -- -- -- -- -- -- -- -- -- -- -- -- */ + +template +struct sse_predux_common { + static EIGEN_STRONG_INLINE double run(const Packet2d& a) { + Packet2d tmp; + tmp = Op::packetOp(a, _mm_unpackhi_pd(a, a)); + return _mm_cvtsd_f64(tmp); + } +}; + +template <> +EIGEN_STRONG_INLINE double predux(const Packet2d& a) { + return sse_predux_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { + return sse_predux_mul_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + return sse_predux_min_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + return sse_predux_min_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + return sse_predux_min_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + return sse_predux_min_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + return sse_predux_max_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + return sse_predux_max_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + return sse_predux_max_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + return 
sse_predux_max_prop_impl::run(a); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) { + return _mm_movemask_pd(x) != 0x0; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_REDUCTIONS_SSE_H diff --git a/test/redux.cpp b/test/redux.cpp index 42c269ae7..c9c397842 100644 --- a/test/redux.cpp +++ b/test/redux.cpp @@ -37,12 +37,9 @@ void matrixRedux(const MatrixType& m) { m2.array() = m2.array() - kMaxVal * (m2.array() / kMaxVal); } - VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1)); - VERIFY_IS_APPROX( - MatrixType::Ones(rows, cols).sum(), - Scalar(float( - rows * - cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy + VERIFY_IS_EQUAL(MatrixType::Zero(rows, cols).sum(), Scalar(0)); + Scalar sizeAsScalar = internal::cast(rows * cols); + VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), sizeAsScalar); Scalar s(0), p(1), minc(numext::real(m1.coeff(0))), maxc(numext::real(m1.coeff(0))); for (int j = 0; j < cols; j++) for (int i = 0; i < rows; i++) { @@ -160,6 +157,10 @@ EIGEN_DECLARE_TEST(redux) { int maxsize = (std::min)(100, EIGEN_TEST_MAX_SIZE); TEST_SET_BUT_UNUSED_VARIABLE(maxsize); for (int i = 0; i < g_repeat; i++) { + int rows = internal::random(1, maxsize); + int cols = internal::random(1, maxsize); + EIGEN_UNUSED_VARIABLE(rows); + EIGEN_UNUSED_VARIABLE(cols); CALL_SUBTEST_1(matrixRedux(Matrix())); CALL_SUBTEST_1(matrixRedux(Array())); CALL_SUBTEST_2(matrixRedux(Matrix2f())); @@ -168,19 +169,37 @@ EIGEN_DECLARE_TEST(redux) { CALL_SUBTEST_3(matrixRedux(Matrix4d())); CALL_SUBTEST_3(matrixRedux(Array4d())); CALL_SUBTEST_3(matrixRedux(Array44d())); - CALL_SUBTEST_4(matrixRedux(MatrixXcf(internal::random(1, maxsize), internal::random(1, maxsize)))); - CALL_SUBTEST_4(matrixRedux(ArrayXXcf(internal::random(1, maxsize), internal::random(1, maxsize)))); - CALL_SUBTEST_5(matrixRedux(MatrixXd(internal::random(1, maxsize), internal::random(1, 
maxsize)))); - CALL_SUBTEST_5(matrixRedux(ArrayXXd(internal::random(1, maxsize), internal::random(1, maxsize)))); - CALL_SUBTEST_6(matrixRedux(MatrixXi(internal::random(1, maxsize), internal::random(1, maxsize)))); - CALL_SUBTEST_6(matrixRedux(ArrayXXi(internal::random(1, maxsize), internal::random(1, maxsize)))); + CALL_SUBTEST_4(matrixRedux(MatrixXf(rows, cols))); + CALL_SUBTEST_4(matrixRedux(ArrayXXf(rows, cols))); + CALL_SUBTEST_4(matrixRedux(MatrixXd(rows, cols))); + CALL_SUBTEST_4(matrixRedux(ArrayXXd(rows, cols))); + /* TODO: fix test for boolean */ + /*CALL_SUBTEST_5(matrixRedux(MatrixX(rows, cols)));*/ + /*CALL_SUBTEST_5(matrixRedux(ArrayXX(rows, cols)));*/ + CALL_SUBTEST_5(matrixRedux(MatrixXi(rows, cols))); + CALL_SUBTEST_5(matrixRedux(ArrayXXi(rows, cols))); + CALL_SUBTEST_5(matrixRedux(MatrixX(rows, cols))); + CALL_SUBTEST_5(matrixRedux(ArrayXX(rows, cols))); + CALL_SUBTEST_6(matrixRedux(MatrixXcf(rows, cols))); + CALL_SUBTEST_6(matrixRedux(ArrayXXcf(rows, cols))); + CALL_SUBTEST_6(matrixRedux(MatrixXcd(rows, cols))); + CALL_SUBTEST_6(matrixRedux(ArrayXXcd(rows, cols))); } for (int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_7(vectorRedux(Vector4f())); - CALL_SUBTEST_7(vectorRedux(Array4f())); - CALL_SUBTEST_5(vectorRedux(VectorXd(internal::random(1, maxsize)))); - CALL_SUBTEST_5(vectorRedux(ArrayXd(internal::random(1, maxsize)))); - CALL_SUBTEST_8(vectorRedux(VectorXf(internal::random(1, maxsize)))); - CALL_SUBTEST_8(vectorRedux(ArrayXf(internal::random(1, maxsize)))); + int size = internal::random(1, maxsize); + EIGEN_UNUSED_VARIABLE(size); + CALL_SUBTEST_8(vectorRedux(Vector4f())); + CALL_SUBTEST_8(vectorRedux(Array4f())); + CALL_SUBTEST_9(vectorRedux(VectorXf(size))); + CALL_SUBTEST_9(vectorRedux(ArrayXf(size))); + CALL_SUBTEST_10(vectorRedux(VectorXd(size))); + CALL_SUBTEST_10(vectorRedux(ArrayXd(size))); + /* TODO: fix test for boolean */ + /*CALL_SUBTEST_10(vectorRedux(VectorX(size)));*/ + /*CALL_SUBTEST_10(vectorRedux(ArrayX(size)));*/ + 
CALL_SUBTEST_10(vectorRedux(VectorXi(size))); + CALL_SUBTEST_10(vectorRedux(ArrayXi(size))); + CALL_SUBTEST_10(vectorRedux(VectorX(size))); + CALL_SUBTEST_10(vectorRedux(ArrayX(size))); } }