From 07db964bdeb78234667266e61dd96873a5eca56f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 14 Apr 2022 16:58:32 +0000 Subject: [PATCH] Restrict new AVX512 trsm to AVX512VL, rename files for consistency. --- Eigen/Core | 4 +++- Eigen/src/Core/arch/AVX/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/AVX512/PacketMath.h | 4 +++- .../arch/AVX512/{trsmKernel_impl.hpp => TrsmKernel.h} | 5 +++-- .../arch/AVX512/{unrolls_impl.hpp => TrsmUnrolls.inc} | 0 test/triangular.cpp | 2 +- 6 files changed, 14 insertions(+), 9 deletions(-) rename Eigen/src/Core/arch/AVX512/{trsmKernel_impl.hpp => TrsmKernel.h} (99%) rename Eigen/src/Core/arch/AVX512/{unrolls_impl.hpp => TrsmUnrolls.inc} (100%) diff --git a/Eigen/Core b/Eigen/Core index 5f5ccc04f..e76ba599d 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -190,7 +190,9 @@ using std::ptrdiff_t; #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" - #include "src/Core/arch/AVX512/trsmKernel_impl.hpp" + #ifdef __AVX512VL__ + #include "src/Core/arch/AVX512/TrsmKernel.h" + #endif #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 21e26d4e3..c12eb0e9e 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -237,7 +237,7 @@ template<> struct unpacket_traits { typedef Packet8i integer_packet; typedef uint8_t mask_t; enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true -#ifdef EIGEN_VECTORIZE_AVX512 +#ifdef __AVX512VL__ , masked_fpops_available=true #endif }; @@ -468,7 +468,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { r template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } -#ifdef EIGEN_VECTORIZE_AVX512 +#ifdef __AVX512VL__ template <> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b, uint8_t umask) { __mmask8 mask = static_cast<__mmask8>(umask); @@ -859,7 +859,7 @@ template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { E template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { -#ifdef EIGEN_VECTORIZE_AVX512 +#ifdef __AVX512VL__ __mmask8 mask = static_cast<__mmask8>(umask); EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskz_loadu_ps(mask, from); #else @@ -927,7 +927,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { -#ifdef EIGEN_VECTORIZE_AVX512 +#ifdef __AVX512VL__ __mmask8 mask = static_cast<__mmask8>(umask); EIGEN_DEBUG_UNALIGNED_STORE return _mm256_mask_storeu_ps(to, mask, from); #else diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index d34e04873..337001b13 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -247,7 +247,7 @@ template <> EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { #if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0) // Inline asm here helps reduce some register spilling in TRSM kernels. - // See note in unrolls::gemm::microKernel in trsmKernel_impl.hpp + // See note in unrolls::gemm::microKernel in TrsmKernel.h Packet16f ret; __asm__ ("vbroadcastss %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from)); return ret; @@ -300,6 +300,7 @@ EIGEN_STRONG_INLINE Packet16i padd(const Packet16i& a, const Packet16i& b) { return _mm512_add_epi32(a, b); } + template <> EIGEN_STRONG_INLINE Packet16f padd(const Packet16f& a, const Packet16f& b, @@ -800,6 +801,7 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( reinterpret_cast(from)); } + template <> EIGEN_STRONG_INLINE Packet16f ploadu(const float* from, uint16_t umask) { __mmask16 mask = static_cast<__mmask16>(umask); diff --git a/Eigen/src/Core/arch/AVX512/trsmKernel_impl.hpp b/Eigen/src/Core/arch/AVX512/TrsmKernel.h similarity index 99% rename from Eigen/src/Core/arch/AVX512/trsmKernel_impl.hpp rename to Eigen/src/Core/arch/AVX512/TrsmKernel.h index 139c69f2b..4b81bf915 100644 --- a/Eigen/src/Core/arch/AVX512/trsmKernel_impl.hpp +++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h @@ -38,8 +38,9 @@ typedef Packet8d vecFullDouble; typedef Packet8f vecHalfFloat; typedef Packet4d vecHalfDouble; -// Compile-time unrolls are implemented here -#include "unrolls_impl.hpp" +// Compile-time unrolls are implemented here. +// Note: this depends on macros and typedefs above. +#include "TrsmUnrolls.inc" #if defined(EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0) diff --git a/Eigen/src/Core/arch/AVX512/unrolls_impl.hpp b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc similarity index 100% rename from Eigen/src/Core/arch/AVX512/unrolls_impl.hpp rename to Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc diff --git a/test/triangular.cpp b/test/triangular.cpp index eae0ea0b0..22598696c 100644 --- a/test/triangular.cpp +++ b/test/triangular.cpp @@ -7,7 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifdef EIGEN_TEST_PART_100 +#if defined(EIGEN_TEST_PART_100) || defined(EIGEN_TEST_PART_ALL) # define EIGEN_NO_DEPRECATED_WARNING #endif