mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-29 23:34:12 +08:00
Remove old Clang compiler bug workarounds. The two LLVM bugs referenced in the comments here have long been fixed. The workarounds had become detrimental because (1) they prevented using fused mul-add on Clang/ARM32 and (2) the unnecessary 'volatile' in 'asm volatile' prevented legitimate reordering by the compiler.
This commit is contained in:
parent
bb56a62582
commit
cc0c38ace8
@ -1010,17 +1010,8 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/,
|
|||||||
return pset1<Packet2ul>(0ULL);
|
return pset1<Packet2ul>(0ULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
|
|
||||||
// then implements a slow software scalar fallback calling fmaf()!
|
#ifdef __ARM_FEATURE_FMA
|
||||||
// Filed LLVM bug:
|
|
||||||
// https://llvm.org/bugs/show_bug.cgi?id=27216
|
|
||||||
#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
|
|
||||||
// See bug 936.
|
|
||||||
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
|
|
||||||
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
|
|
||||||
// MLA is not fused i.e. does 2 roundings.
|
|
||||||
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
|
|
||||||
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
||||||
{ return vfmaq_f32(c,a,b); }
|
{ return vfmaq_f32(c,a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
|
template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
|
||||||
@ -1028,25 +1019,7 @@ template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f&
|
|||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
||||||
{
|
{
|
||||||
#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
|
|
||||||
// Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
|
|
||||||
// at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
|
|
||||||
// -march=armv7-a, that is a very common case.
|
|
||||||
// See e.g. this thread:
|
|
||||||
// http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
|
|
||||||
// Filed LLVM bug:
|
|
||||||
// https://llvm.org/bugs/show_bug.cgi?id=27219
|
|
||||||
Packet4f r = c;
|
|
||||||
asm volatile(
|
|
||||||
"vmla.f32 %q[r], %q[a], %q[b]"
|
|
||||||
: [r] "+w" (r)
|
|
||||||
: [a] "w" (a),
|
|
||||||
[b] "w" (b)
|
|
||||||
: );
|
|
||||||
return r;
|
|
||||||
#else
|
|
||||||
return vmlaq_f32(c,a,b);
|
return vmlaq_f32(c,a,b);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
|
template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user