From dcceb9afec3be00dda399a622a5f6c2e520a3906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20S=C3=A1nchez?=
Date: Fri, 26 Apr 2024 15:28:03 +0000
Subject: [PATCH] Unbork avx512 preduce_mul on MSVC.

---
 Eigen/src/Core/arch/AVX512/PacketMath.h | 18 ++++++++++++++++++
 test/packetmath.cpp                     |  9 +++++++++
 2 files changed, 27 insertions(+)

diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index aa338d112..8f7662f47 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -1562,10 +1562,28 @@ template <>
 EIGEN_STRONG_INLINE int predux_mul<Packet16i>(const Packet16i& a) {
   return _mm512_reduce_mul_epi32(a);
 }
+
+#if EIGEN_COMP_MSVC
+// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
+// alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
+// int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
+// produces garbage: 4294967295. It seems to happen whenever the output is supposed to be negative.
+// Fall back to a manual approach:
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
+  Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
+  Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
+  Packet4l res = pmul(lane0, lane1);
+  res = pmul(res, Packet4l(_mm256_permute2x128_si256(res, res, 1)));
+  res = pmul(res, Packet4l(_mm256_shuffle_epi32(res, 0xE)));
+  return pfirst(res);
+}
+#else
 template <>
 EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
   return _mm512_reduce_mul_epi64(a);
 }
+#endif
 
 template <>
 EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 8bfa32196..b9a06b869 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -623,6 +623,15 @@ void packetmath() {
     VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4");
   }
 
+  // Avoid overflows.
+  if (NumTraits<Scalar>::IsInteger && NumTraits<Scalar>::IsSigned) {
+    Scalar limit =
+        static_cast<Scalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())),
+                                     1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size)));
+    for (int i = 0; i < PacketSize; ++i) {
+      data1[i] = internal::random<Scalar>(-limit, limit);
+    }
+  }
   ref[0] = Scalar(1);
   for (int i = 0; i < PacketSize; ++i) ref[0] = REF_MUL(ref[0], data1[i]);
   VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");