From d194167149a2f18d51ea07ed6e87d2d51744c537 Mon Sep 17 00:00:00 2001 From: Lianhuang Li Date: Thu, 1 Dec 2022 17:28:57 +0000 Subject: [PATCH] Fix the bug using neon instruction fmla for data type half --- .../Core/arch/NEON/GeneralBlockPanelKernel.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index e49e3947a..00bdb9b8a 100644 --- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -183,7 +183,11 @@ struct gebp_traits } }; -#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +// The register at operand 3 of fmla for data type half must be v0~v15; the compiler may not +// allocate a required register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]', +// so inline assembly can't be used here to avoid the bug that vfmaq_lane_f16 is implemented +// through a costly dup in the gcc compiler. +#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG template<> struct gebp_traits @@ -240,19 +244,10 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { - #if EIGEN_COMP_GNUC_STRICT - // 1. vfmaq_lane_f16 is implemented through a costly dup - // 2. workaround the gcc register split problem on arm64-neon - if(LaneID==0) asm("fmla %0.8h, %1.8h, %2.h[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==1) asm("fmla %0.8h, %1.8h, %2.h[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==2) asm("fmla %0.8h, %1.8h, %2.h[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==3) asm("fmla %0.8h, %1.8h, %2.h[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); - #else c = vfmaq_lane_f16(c, a, b, LaneID); - #endif } }; -#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG #endif // EIGEN_ARCH_ARM64 } // namespace internal