From 3f3ce214e65505a2ed891bcb5b649c8c5c712649 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 18 Apr 2023 02:38:38 +0000 Subject: [PATCH] New BF16 pcast functions and move type casting to TypeCasting.h --- Eigen/Core | 1 + Eigen/src/Core/arch/AltiVec/PacketMath.h | 127 +-------------- Eigen/src/Core/arch/AltiVec/TypeCasting.h | 178 ++++++++++++++++++++++ 3 files changed, 182 insertions(+), 124 deletions(-) create mode 100644 Eigen/src/Core/arch/AltiVec/TypeCasting.h diff --git a/Eigen/Core b/Eigen/Core index 4d23920d5..bf0b9c736 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -220,6 +220,7 @@ using std::ptrdiff_t; #include "src/Core/arch/SSE/Complex.h" #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" + #include "src/Core/arch/AltiVec/TypeCasting.h" #include "src/Core/arch/AltiVec/MathFunctions.h" #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index e0168513d..5f4ccda5e 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -2697,102 +2697,6 @@ template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, c return vec_sel(elsePacket, thenPacket, mask); } -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return vec_cts(a,0); -} - -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { - return vec_ctu(a,0); -} - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return vec_ctf(a,0); -} - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { - return vec_ctf(a,0); -} - -template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8bf& a) { - Packet4f float_even = Bf16ToF32Even(a); - Packet4f float_odd = Bf16ToF32Odd(a); - Packet4ui int_even = pcast(float_even); - Packet4ui int_odd = pcast(float_odd); - const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); - Packet4ui low_even = pand(int_even, p4ui_low_mask); - Packet4ui low_odd = pand(int_odd, p4ui_low_mask); - - //Check values that are bigger than USHRT_MAX (0xFFFF) - Packet4bi overflow_selector; - if(vec_any_gt(int_even, p4ui_low_mask)){ - overflow_selector = vec_cmpgt(int_even, p4ui_low_mask); - low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector); - } - if(vec_any_gt(int_odd, p4ui_low_mask)){ - overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask); - low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector); - } - - return pmerge(low_even, low_odd); -} - -template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8us& a) { - //short -> int -> float -> bfloat16 - const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); - Packet4ui int_cast = reinterpret_cast(a); - Packet4ui int_even = pand(int_cast, p4ui_low_mask); - Packet4ui int_odd = plogical_shift_right<16>(int_cast); - Packet4f float_even = pcast(int_even); - Packet4f float_odd = pcast(int_odd); - return F32ToBf16(float_even, float_odd); -} - - -template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return reinterpret_cast(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return reinterpret_cast(a); -} - - //---------- double ---------- #ifdef EIGEN_VECTORIZE_VSX @@ -2805,7 +2709,6 @@ typedef Packet2ul Packet2bl; typedef __vector __bool long Packet2bl; #endif -static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; @@ -3082,34 +2985,10 @@ template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) return reinterpret_cast(vec_perm(tmp, tmp, p16uc_DUPSIGN)); } #endif -// VSX support varies between different compilers and even different -// versions of the same compiler. For gcc version >= 4.9.3, we can use -// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use -// a slow version that works with older compilers. -// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles -// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 -template<> -inline Packet2l pcast(const Packet2d& x) { -#if EIGEN_GNUC_STRICT_AT_LEAST(7,1,0) - return vec_cts(x, 0); // TODO: check clang version. -#else - double tmp[2]; - memcpy(tmp, &x, sizeof(tmp)); - Packet2l l = { static_cast(tmp[0]), - static_cast(tmp[1]) }; - return l; -#endif -} -template<> -inline Packet2d pcast(const Packet2l& x) { - unsigned long long tmp[2]; - memcpy(tmp, &x, sizeof(tmp)); - Packet2d d = { static_cast(tmp[0]), - static_cast(tmp[1]) }; - return d; -} +template<> inline Packet2l pcast(const Packet2d& x); +template<> inline Packet2d pcast(const Packet2l& x); // Packet2l shifts. // For POWER8 we simply use vec_sr/l. @@ -3290,7 +3169,7 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); + Packet2ul mask = reinterpret_cast(pnegate(reinterpret_cast(select))); return vec_sel(elsePacket, thenPacket, mask); } diff --git a/Eigen/src/Core/arch/AltiVec/TypeCasting.h b/Eigen/src/Core/arch/AltiVec/TypeCasting.h new file mode 100644 index 000000000..bda63d854 --- /dev/null +++ b/Eigen/src/Core/arch/AltiVec/TypeCasting.h @@ -0,0 +1,178 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Rasmus Munk Larsen +// Copyright (C) 2023 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_ALTIVEC_H +#define EIGEN_TYPE_CASTING_ALTIVEC_H + +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vec_cts(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return vec_ctu(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8bf& a) { + Packet4f float_even = Bf16ToF32Even(a); + Packet4f float_odd = Bf16ToF32Odd(a); + Packet4ui int_even = pcast(float_even); + Packet4ui int_odd = pcast(float_odd); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + Packet4ui low_even = pand(int_even, p4ui_low_mask); + Packet4ui low_odd = pand(int_odd, p4ui_low_mask); + + //Check values that are bigger than USHRT_MAX (0xFFFF) + Packet4bi overflow_selector; + if(vec_any_gt(int_even, p4ui_low_mask)){ + overflow_selector = vec_cmpgt(int_even, p4ui_low_mask); + low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector); + } + if(vec_any_gt(int_odd, p4ui_low_mask)){ + overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask); + low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector); + } + + return pmerge(low_even, low_odd); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8us& a) { + //short -> int -> float -> bfloat16 + const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + Packet4ui int_cast = reinterpret_cast(a); + Packet4ui int_even = pand(int_cast, p4ui_low_mask); + Packet4ui int_odd = plogical_shift_right<16>(int_cast); + Packet4f float_even = pcast(int_even); + Packet4f float_odd = pcast(int_odd); + return F32ToBf16(float_even, float_odd); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet8bf& a) { + Packet8us z = pset1(0); +#ifdef _BIG_ENDIAN + return reinterpret_cast(vec_mergeh(a.m_val, z)); +#else + return reinterpret_cast(vec_mergeh(z, a.m_val)); +#endif +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet4f& a, const Packet4f &b) { + return F32ToBf16Both(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return reinterpret_cast(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return reinterpret_cast(a); +} + +#ifdef EIGEN_VECTORIZE_VSX +// VSX support varies between different compilers and even different +// versions of the same compiler. For gcc version >= 4.9.3, we can use +// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use +// a slow version that works with older compilers. +// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles +// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 +template<> +inline Packet2l pcast(const Packet2d& x) { +#if EIGEN_GNUC_STRICT_AT_LEAST(7,1,0) + return vec_cts(x, 0); // TODO: check clang version. +#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return l; +#endif +} + +template<> +inline Packet2d pcast(const Packet2l& x) { + unsigned long long tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2d d = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return d; +} +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_ALTIVEC_H