From 66d073c38e3cd5dad974deea7b3d1d45247ea55b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Fri, 9 Aug 2019 15:56:26 -0600 Subject: [PATCH 01/30] bug #1718: Add cast to successfully compile with clang on PowerPC Ignoring -Wc11-extensions warnings thrown by clang at Altivec/PacketMath.h --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +- Eigen/src/Core/util/DisableStupidWarnings.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 4b770d036..f3d374a62 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -452,7 +452,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { - return vec_sel(b, a, mask); + return vec_sel(b, a, reinterpret_cast(mask)); } template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 6c7c2d655..4501d3248 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -44,6 +44,11 @@ #if __clang_major__ >= 3 && __clang_minor__ >= 5 #pragma clang diagnostic ignored "-Wabsolute-value" #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif #elif defined __GNUC__ From 4d29aa0294a0d0aa21c41eef687840a5c59bf692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Fri, 9 Aug 2019 15:59:26 -0600 Subject: [PATCH 02/30] Fix offset argument of ploadu/pstoreu for Altivec If no offset is given, them it should be zero. Also passes full address to vec_vsx_ld/st builtins. Removes userless _EIGEN_ALIGNED_PTR & _EIGEN_MASK_ALIGNMENT. Removes unnecessary casts. 
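For reference, the reason passing offset 0 together with the full address is equivalent to the old aligned-base-plus-offset scheme is plain pointer arithmetic. A minimal standalone sketch, not part of the patch, that only demonstrates the address identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      float data[8] = {0};
      const float* from = data + 1;  // a pointer that need not be 16-byte aligned
      std::uintptr_t p    = reinterpret_cast<std::uintptr_t>(from);
      std::uintptr_t base = p & ~static_cast<std::uintptr_t>(15);  // old _EIGEN_ALIGNED_PTR(from)
      std::uintptr_t off  = p & 15;                                // old "(long)from & 15" offset
      // vec_vsx_ld(off, base) and vec_vsx_ld(0, from) designate the same bytes:
      assert(base + off == p);
      return 0;
    }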
--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index f3d374a62..1fef285ce 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -83,15 +83,6 @@ static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; -// Mask alignment -#ifdef __PPC64__ -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#else -#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 -#endif - -#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) - // Handle endianness properly while loading constants // Define global static constants: #ifdef _BIG_ENDIAN @@ -487,12 +478,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); + return vec_vsx_ld(0, from); } template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); + return vec_vsx_ld(0, from); } #endif @@ -553,12 +544,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); + vec_vsx_st(from, 0, to); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + vec_vsx_st(from, 0, to); } #endif @@ -1045,7 +1036,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD - return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); + return vec_vsx_ld(0, from); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) @@ -1059,7 +1050,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + vec_vsx_st(from, 0, to); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } From 787f6ef0254949380cc6955890eeb9c282c2350f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Fri, 9 Aug 2019 16:02:55 -0600 Subject: [PATCH 03/30] Fix packed load/store for PowerPC's VSX The vec_vsx_ld/vec_vsx_st builtins were wrongly used for aligned load/store. In fact, they perform unaligned memory access and, even when the address is 16-byte aligned, they are much slower (at least 2x) than their aligned counterparts. For double/Packet2d vec_xl/vec_xst should be prefered over vec_ld/vec_st, although the latter works when casted to float/Packet4f. Silencing some weird warning with throw but some GCC versions. Such warning are not thrown by Clang. 
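In intrinsic terms, the aligned double-precision path this patch settles on looks like the sketch below (assuming a VSX-enabled compiler; the const_cast mirrors the workaround for older Clang vec_xl prototypes used in the hunk further down, and the helper names are illustrative, not Eigen API):

    #include <altivec.h>   // build with -maltivec -mvsx

    typedef __vector double Packet2d;   // same vector type Eigen uses for Packet2d

    // 16-byte load/store of two doubles via vec_xl/vec_xst, instead of the
    // unaligned vec_vsx_ld/vec_vsx_st or a float-cast vec_ld/vec_st.
    static inline Packet2d load2d_aligned(const double* from) {
      return vec_xl(0, const_cast<double*>(from));  // cast needed by some Clang headers
    }
    static inline void store2d_aligned(double* to, Packet2d v) {
      vec_xst(v, 0, to);
    }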
--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 40 +++++++++--------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 1fef285ce..30694d424 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -240,42 +240,38 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) // Need to define them first or we get specialization after instantiation errors template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else return vec_ld(0, from); -#endif } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else return vec_ld(0, from); -#endif } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else vec_st(from, 0, to); -#endif } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else vec_st(from, 0, to); -#endif } template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { @@ -940,21 +936,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return vec_xl(0, const_cast(from)); // cast needed by Clang } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { From a3298b22ecd19a80a2fc03df3d463fdb04907c87 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 12 Aug 2019 13:53:28 -0700 Subject: [PATCH 04/30] Implement vectorized versions of log1p and expm1 in Eigen using Kahan's formulas, and change the scalar implementations to properly handle infinite arguments. Depending on instruction set, significant speedups are observed for the vectorized path: log1p wall time is reduced 60-93% (2.5x - 15x speedup) expm1 wall time is reduced 0-85% (1x - 7x speedup) The scalar path is slower by 20-30% due to the extra branch needed to handle +infinity correctly. 
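For readers unfamiliar with the trick, here is a scalar sketch of Kahan's formulas as used by the new packet code (illustrative only; the function names are not Eigen API):

    #include <cmath>

    // log1p(x): u = 1+x rounds away the information about tiny x, but the ratio
    // x/(u-1) restores it, so log(u)*x/(u-1) stays accurate for small x.
    double log1p_kahan(double x) {
      double u = 1.0 + x;
      double lu = std::log(u);
      if (u == 1.0 || x == lu) return x;   // tiny x, or x == +inf (log(+inf) == +inf)
      return x * (lu / (u - 1.0));
    }

    // expm1(x): same idea with u = exp(x); the log(u) == u test catches overflow
    // to +inf without having to materialize an infinity constant.
    double expm1_kahan(double x) {
      double u = std::exp(x);
      if (u == 1.0) return x;              // tiny x: exp(x)-1 ~= x
      double um1 = u - 1.0;
      if (um1 == -1.0) return -1.0;        // x == -inf
      double lu = std::log(u);
      if (lu == u) return u;               // x so large that exp(x) == +inf
      return um1 * (x / lu);
    }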
Full benchmarks measured on Intel(R) Xeon(R) Gold 6154 here: https://bitbucket.org/snippets/rmlarsen/MXBkpM --- Eigen/src/Core/MathFunctions.h | 8 +++- Eigen/src/Core/arch/AVX/MathFunctions.h | 10 ++++ Eigen/src/Core/arch/AVX/PacketMath.h | 2 + Eigen/src/Core/arch/AVX512/MathFunctions.h | 12 +++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 2 + .../arch/Default/GenericPacketMathFunctions.h | 46 +++++++++++++++++++ Eigen/src/Core/arch/SSE/MathFunctions.h | 13 +++++- Eigen/src/Core/arch/SSE/PacketMath.h | 2 + test/packetmath.cpp | 6 ++- 9 files changed, 95 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 8bef59354..1eeb2752b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -501,7 +501,8 @@ namespace std_fallback { } EIGEN_USING_STD_MATH(log); - return (u - RealScalar(1)) * x / log(u); + Scalar logu = log(u); + return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu; } } @@ -548,7 +549,10 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + Scalar log_1p = log(x1p); + const bool is_inf = numext::equal_strict(x1p, log_1p); + const bool is_small = numext::equal_strict(x1p, Scalar(1)); + return (is_inf || is_small) ? x : x * (log_1p / (x1p - RealScalar(1))); } } diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 9f375ed98..c6d3cf6a0 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -36,6 +36,16 @@ plog(const Packet8f& _x) { return plog_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f plog1p(const Packet8f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f pexpm1(const Packet8f& _x) { + return generic_expm1(_x); +} + // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). 
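As an aside, the range-reduction scheme described in the comment above can be written in scalar form roughly as follows; this is a sketch of the stated identities only, with std::exp standing in for the short polynomial the SIMD code actually uses:

    #include <cmath>

    double exp_range_reduced(double x) {
      const double ln2 = 0.6931471805599453;
      double m = std::floor(x / ln2 + 0.5);   // m = floor(x/log(2) + 1/2)
      double r = x - m * ln2;                 // remainder, |r| <= log(2)/2
      return std::ldexp(std::exp(r), static_cast<int>(m));  // exp(x) = 2^m * exp(r)
    }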
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 7ee9dee10..5233195f3 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -65,6 +65,8 @@ template<> struct packet_traits : default_packet_traits HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index c2158c538..9e37a720b 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -393,6 +393,18 @@ pcos(const Packet16f& _x) { return pcos_float(_x); } +#if defined(EIGEN_VECTORIZE_AVX512DQ) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f plog1p(const Packet16f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f pexpm1(const Packet16f& _x) { + return generic_expm1(_x); +} +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 383c49636..0a81fe02d 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -60,6 +60,8 @@ template<> struct packet_traits : default_packet_traits #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, #endif HasExp = 1, HasSqrt = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 43e827638..640aae05a 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -126,6 +126,52 @@ Packet plog_float(const Packet _x) por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); } +/** \internal \returns log(1 + x) computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_plog1p(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + Packet xp1 = padd(x, one); + Packet small_mask = pcmp_eq(xp1, one); + Packet log1 = plog(xp1); + // Add a check to handle x == +inf. + Packet pos_inf_mask = pcmp_eq(x, log1); + Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); + return pselect(por(small_mask, pos_inf_mask), x, log_large); +} + +/** \internal \returns exp(x)-1 computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_expm1(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + const Packet neg_one = pset1(ScalarType(-1)); + Packet u = pexp(x); + Packet one_mask = pcmp_eq(u, one); + Packet u_minus_one = psub(u, one); + Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one); + Packet logu = plog(u); + // The following comparison is to catch the case where + // exp(x) = +inf. It is written in this way to avoid having + // to form the constant +inf, which depends on the packet + // type. + Packet pos_inf_mask = pcmp_eq(logu, u); + Packet expm1 = pmul(u_minus_one, pdiv(x, logu)); + expm1 = pselect(pos_inf_mask, u, expm1); + return pselect(one_mask, + x, + pselect(neg_one_mask, + neg_one, + expm1)); +} + + // Exponential function. 
Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 0d491ab88..02c8f3c2f 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -22,11 +22,20 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ +Packet4f plog(const Packet4f& _x) { return plog_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog1p(const Packet4f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexpm1(const Packet4f& _x) { + return generic_expm1(_x); +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 0d571ce61..94603dd55 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,6 +110,8 @@ template<> struct packet_traits : default_packet_traits HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, diff --git a/test/packetmath.cpp b/test/packetmath.cpp index f1448f335..41000a842 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -604,11 +604,13 @@ template void packetmath_real() CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); #if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L) - CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); - CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); + data1[0] = std::numeric_limits::infinity(); + data1[1] = std::numeric_limits::denorm_min(); + CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); + CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); #endif if(PacketSize>=2) From db9147ae40695e43ec694b2e207d0acc5b0570d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Wed, 14 Aug 2019 10:37:39 -0600 Subject: [PATCH 05/30] Add missing pcmp_XX methods for double/Packet2d This actually fixes an issue in unit-test packetmath_2 with pcmp_eq when it is compiled with clang. When pcmp_eq(Packet4f,Packet4f) is used instead of pcmp_eq(Packet2d,Packet2d), the unit-test does not pass due to NaN on ref vector. 
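One non-obvious case among the added comparisons is pcmp_lt_or_nan: VSX has no single "less-than or unordered" compare, so the hunk below builds it as the bitwise complement of vec_cmpge, which yields an all-zero lane whenever an operand is NaN. A standalone sketch of the idea, assuming a VSX-enabled compiler (helper name illustrative):

    #include <altivec.h>   // build with -maltivec -mvsx

    typedef __vector double Packet2d;

    // "a < b or unordered": vec_cmpge produces an all-zero lane when either
    // operand is NaN, so its bitwise complement covers both a<b and NaN cases.
    static inline Packet2d cmp_lt_or_nan(Packet2d a, Packet2d b) {
      Packet2d ge = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
      return vec_nor(ge, ge);   // NOR with itself == bitwise NOT
    }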
--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 30694d424..521e6076d 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1009,6 +1009,14 @@ template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const return ret; } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { + Packet2d c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} + template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } From 5ac7984ffa2076cc5b26fb220a3b351951251c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Wed, 14 Aug 2019 11:59:12 -0600 Subject: [PATCH 06/30] Fix debug macros in p{load,store}u --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 521e6076d..7ee290a29 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -539,12 +539,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f // We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_ALIGNED_STORE + EIGEN_DEBUG_UNALIGNED_STORE vec_vsx_st(from, 0, to); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { - EIGEN_DEBUG_ALIGNED_STORE + EIGEN_DEBUG_UNALIGNED_STORE vec_vsx_st(from, 0, to); } #endif @@ -1031,7 +1031,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD + EIGEN_DEBUG_UNALIGNED_LOAD return vec_vsx_ld(0, from); } @@ -1045,7 +1045,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_ALIGNED_STORE + EIGEN_DEBUG_UNALIGNED_STORE vec_vsx_st(from, 0, to); } From 071311821e509d87bec609d6a3aeea9dc74cfd66 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 19 Aug 2019 11:44:25 -0700 Subject: [PATCH 07/30] Remove XSMM support from Tensor module --- cmake/FindXsmm.cmake | 25 -- unsupported/Eigen/CXX11/Tensor | 4 - .../CXX11/src/Tensor/TensorContraction.h | 281 ------------------ .../src/Tensor/TensorContractionBlocking.h | 135 --------- .../src/Tensor/TensorContractionThreadPool.h | 201 ------------- unsupported/test/CMakeLists.txt | 11 - unsupported/test/cxx11_tensor_contraction.cpp | 7 - 7 files changed, 664 deletions(-) delete mode 100644 cmake/FindXsmm.cmake diff --git a/cmake/FindXsmm.cmake b/cmake/FindXsmm.cmake deleted file mode 100644 index 809d6f414..000000000 --- a/cmake/FindXsmm.cmake +++ /dev/null @@ -1,25 
+0,0 @@ -# libxsmm support. -# libxsmm provides matrix multiplication kernels optimized for -# the latest Intel architectures. -# Download the library from https://github.com/hfp/libxsmm -# Compile with make BLAS=0 - -if (LIBXSMM) - set(XSMM_FIND_QUIETLY TRUE) - set(XSMM_INCLUDES ${LIBXSMM}/include) - set(XSMM_LIBRARIES ${LIBXSMM}/lib) -endif (LIBXSMM) - -find_path(LIBXSMM - NAMES - libxsmm.h - PATHS - $ENV{XSMMDIR}/include - ${INCLUDE_INSTALL_DIR} -) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(XSMM DEFAULT_MSG - LIBXSMM) - -mark_as_advanced(LIBXSMM) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 25b663046..5d18aeb3f 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -73,10 +73,6 @@ typedef unsigned __int64 uint64_t; #include #endif -#if defined(EIGEN_USE_LIBXSMM) -#include "libxsmm.h" -#endif - #ifdef EIGEN_USE_THREADS #include "ThreadPool" #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index de7c2248a..a398b2b3f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,70 +20,6 @@ namespace Eigen { * */ namespace internal { -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) -template -void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) { - size_t psize = packet_traits::size; // Packet size - typedef typename packet_traits::type Packet; // Packet type - size_t alignment = psize*sizeof(Scalar); // Needed alignment - if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 && - (ldsrc*sizeof(Scalar)) % alignment == 0 && - reinterpret_cast(src) % alignment == 0 && - reinterpret_cast(dst) % alignment == 0) { - // Optimized version using packets - size_t num_packets = rows / psize; - for (Index col = 0; col < cols; ++col) { - EIGEN_ASM_COMMENT("begin pack_simple inner copy"); - // Unrolled manually 4 times. 
- for (size_t i=0; i < num_packets/4; ++i) { - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - } - for (size_t i=0; i < num_packets%4; ++i) { - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - } - dst += lddst - num_packets*psize; - src += ldsrc - num_packets*psize; - EIGEN_ASM_COMMENT("end pack_simple inner copy"); - } - } else { - // Naive memcpy calls - for (Index col = 0; col < cols; ++col) { - memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); - } - } -} - -template - struct libxsmm_wrapper { - libxsmm_wrapper() {} - libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {} - }; - - template<> - struct libxsmm_wrapper: public libxsmm_mmfunction { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; - - template<> - struct libxsmm_wrapper: public libxsmm_mmfunction { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; -#endif - template struct traits > @@ -640,8 +576,6 @@ struct TensorContractionEvaluatorBase } } - EnableXSMMIfPossible(eval_op_indices); - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -780,13 +714,6 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC #endif void evalGemm(Scalar* buffer) const { - #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (m_can_use_xsmm) { - evalGemmXSMM(buffer); - return; - } - #endif - // columns in left side, rows in right side const Index k = this->m_k_size; @@ -942,213 +869,6 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { - m_can_use_xsmm = false; - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - if (!std::is_same::value || - !std::is_same::value || - !(std::is_same::value || - std::is_same::value) || - m_leftImpl.data() == NULL || - m_rightImpl.data() == NULL) { - return; - } - - // Check if we can use faster matmul algorithms. For contraction to be - // equivalent to matmul, we need both lhs and rhs contracting dims sequences - // to be either a prefix or suffix of all dims. Also, the order of both - // must be the same, so we don't have to do reordering. 
- // For example: - // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)] - // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)] - // Depending if contraction dims are prefix or suffix of all dims we need to - // pre-transpose matrices in matmul algorithm: - // lhs: prefix -> transpose, suffix -> no transpose - // rhs: prefix -> no transpose, suffix -> transpose - // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular, - // non-transposed matmul. - if (ContractDims == 0) { - // This case is totally uninteresting, filter it out to avoid problems - // with iterations in further tests. - return; - } - - // Check if RHS dims list is increasing. LHS already is, so if not, the - // order is different and we cannot do matmul. - for (int i = 1; i < ContractDims; i++) { - if (eval_op_indices[i].second < eval_op_indices[i-1].second) { - return; - } - } - - // Check if no holes. - int diff; - for (int i = 1; i < ContractDims; i++) { - // LHS contract dims are sorted to form an increasing seq. - diff = eval_op_indices[i].first - eval_op_indices[i-1].first; - if (diff != 1) { - return; - } - // Now we may already assume RHS contract dims seq is increasing too. - diff = eval_op_indices[i].second - eval_op_indices[i-1].second; - if (diff != 1) { - return; - } - } - - // Check if suffix or prefix. - if (eval_op_indices[0].first != 0 && - eval_op_indices[ContractDims-1].first != LDims-1) { - return; - } - if (eval_op_indices[0].second != 0 && - eval_op_indices[ContractDims-1].second != RDims-1) { - return; - } - - m_can_use_xsmm = true; -#else - EIGEN_UNUSED_VARIABLE(eval_op_indices); -#endif - } - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - const bool transposeA = !m_lhs_inner_dim_contiguous; - const bool transposeB = !m_rhs_inner_dim_contiguous; - - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - internal::TensorXsmmContractionBlocking blocking( - k, m, n, 1, transposeA, transposeB); - - // Outer blocks sizes - const Index mc_outer = blocking.outer_m(); - const Index nc_outer = blocking.outer_n(); - const Index kc_outer = blocking.outer_k(); - // Inner blocks sizes - const Index mc = blocking.mc(); - const Index nc = blocking.nc(); - const Index kc = blocking.kc(); - // Decisions whether we should copy parts of matrices - const bool copyA = blocking.copyA(); - const bool copyB = blocking.copyB(); - - const LhsScalar* leftData = m_leftImpl.data(); - const RhsScalar* rightData = m_rightImpl.data(); - - const libxsmm_blasint stride_A = static_cast(transposeA ? k : m); - const libxsmm_blasint stride_B = static_cast(transposeB ? n : k); - const libxsmm_blasint stride_C = static_cast(m); - - const libxsmm_blasint stride_blockA = static_cast(mc); - // Use bigger stride to avoid hitting same cache line too often. - // This consistently gives +~0.5 Gflops. - const libxsmm_blasint stride_panelB = static_cast( - kc % 32 == 0 ? 
kc + 16 : kc - ); - - // Kernel for the general case (not edges) - internal::libxsmm_wrapper kernel; - - LhsScalar* blockA = NULL; - RhsScalar* panelB = NULL; - - if (copyA) { - blockA = static_cast(this->m_device.allocate(mc * kc * sizeof(LhsScalar))); - } - if (copyB) { - panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); - } - - const Index kernel_stride_A = copyA ? stride_blockA : stride_A; - const Index kernel_stride_B = copyB ? stride_panelB : stride_B; - kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); - - // Outer blocking - for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) { - for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) { - for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) { - using numext::mini; - - Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer; - - // Inner blocking - for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { - const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; - const float beta = ki == 0 ? 0 : 1; - - if (copyB) { - if (transposeB) { - libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB); - } else { - internal::pack_simple(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B); - } - } - - for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { - const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; - - const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki : - leftData + ki*stride_A + mi; - - if (copyA) { - if (transposeA) { - libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA); - } else { - internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); - } - } - const LhsScalar* actual_a = copyA ? blockA : a; - - for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { - const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; - - const RhsScalar* b = rightData + ni*stride_B + ki; - Scalar* c = buffer + ni*stride_C + mi; - const Scalar* cp = c + nc*stride_C; - - const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; - const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; - - if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { - // Most used, cached kernel. - kernel(actual_a, actual_b, c, actual_a, bp, cp); - } else { - // Edges - use libxsmm kernel cache. 
- internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp); - } - } - } - } - } - } - } - - if (copyA) { - this->m_device.deallocate(blockA); - } - if (copyB) { - this->m_device.deallocate(panelB); - } - } -#endif - // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -1177,7 +897,6 @@ protected: const Device EIGEN_DEVICE_REF m_device; OutputKernelType m_output_kernel; EvaluatorPointerType m_result; - bool m_can_use_xsmm; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index c51f3f8dd..974feb0ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -67,141 +67,6 @@ class TensorContractionBlocking { StorageIndex nc_; }; - - -#if defined(EIGEN_USE_LIBXSMM) -template -class TensorXsmmContractionBlocking { - public: - TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, - size_t max_num_threads = 1, bool transposeA = false, - bool transposeB = false): - k_(k), m_(m), n_(n), transposeA_(transposeA), - transposeB_(transposeB), num_threads_(max_num_threads) { -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M; - kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K; - nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N; - outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M; - outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K; - outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N; - copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A; - copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B; - outer_m_ = outer_m_ != 0 ? outer_m_ : m; - outer_k_ = outer_k_ != 0 ? outer_k_ : k; - outer_n_ = outer_n_ != 0 ? outer_n_ : n; - } -#else - // Defaults, possibly overridden per-platform. - copyA_ = true; - copyB_ = false; - - // If the matrix is small enough, don't do blocking, just call single xsmm - // kernel. - if (static_cast(m)*k*n <= LIBXSMM_THRESHOLD) { - mc_ = m; kc_ = k; nc_ = n; - outer_m_ = m; outer_k_ = k; outer_n_ = n; - copyA_ = false; copyB_ = false; - } else { - int arch = libxsmm_cpuid_x86(); - - if (arch == LIBXSMM_X86_AVX512_CORE) { - // skylake - mc_ = 64; kc_ = 64; nc_ = 24; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22; - // Hack to use this kernel architecture as the other one has performance - // issues (no hardware prefetching). - // TODO(nishantpatil): This should be removed if the issues are fixed, - // or this one becomes the default. - setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1); - } else if (arch == LIBXSMM_X86_AVX2) { - // haswell - mc_ = 32; kc_ = 192; nc_ = 33; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16; - } else if (arch == LIBXSMM_X86_AVX) { - // ivybridge - mc_ = 32; kc_ = 192; nc_ = 48; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11; - } else { - // generic kernel size, usually performing well - mc_ = 32; kc_ = 128; nc_ = 32; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 512; - } - - // Only copy if it makes the stride smaller. 
- copyA_ = copyA_ && (m > mc_); - copyB_ = copyB_ && (k > kc_); - } - - // We need to copy anyway if transposing - copyA_ = copyA_ || transposeA; - copyB_ = copyB_ || transposeB; - - // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h - prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C; - -#endif - - mc_ = mc_ > m ? m : mc_; - nc_ = nc_ > n ? n : nc_; - kc_ = kc_ > k ? k : kc_; - - size_t compute_parallelism = (m / mc_) * (n / nc_); - size_t pack_parallelism = 0; - if (copyA_) { - pack_parallelism += (m / mc_) * (k / kc_); - } - if (copyB_) { - pack_parallelism += (n / nc_) * (k / kc_); - } - size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism); - - num_threads_ = numext::mini(num_threads_, - parallelism / MIN_JOBS_PER_THREAD); - num_threads_ = numext::maxi(num_threads_, 1); - - // For optimal performance outer block sizes should be multiplies of kernel - // sizes, or bigger than matrix size (=no outer blocking). - eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m); - eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k); - eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n); - } - - EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } - EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } - EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } - EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; } - EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; } - EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; } - EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; } - EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; } - EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; } - EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; } - EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; } - EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); } - EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); } - EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); } - EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const { - return prefetch_; - } - - private: - StorageIndex k_, m_, n_; - StorageIndex kc_, mc_, nc_; - StorageIndex outer_k_, outer_m_, outer_n_; - bool copyA_, copyB_, transposeA_, transposeB_; - size_t num_threads_; - - // Threshold for m*k*n to skip blocking and just call libxsmm - const double LIBXSMM_THRESHOLD = 80*80*80; - // For computing optimal number of threads - so that each thread gets at least - // that many jobs. 
- const double MIN_JOBS_PER_THREAD = 3; - libxsmm_gemm_prefetch_type prefetch_; -}; -#endif // EIGEN_USE_LIBXSMM - } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 22db6f01b..ca20038a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -78,23 +78,6 @@ struct TensorEvaluatorm_k_size; if (m == 0 || n == 0 || k == 0) return; -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (this->m_can_use_xsmm) { - bool transposeA = !this->m_lhs_inner_dim_contiguous; - bool transposeB = !this->m_rhs_inner_dim_contiguous; - internal::TensorXsmmContractionBlocking - blocking(k, m, n, this->m_device.numThreads(), transposeA, - transposeB); - - if (blocking.num_threads() == 1) { - this->evalGemmXSMM(buffer); - } else { - ContextXsmm(this, buffer, m, n, k, blocking).run(); - } - return; - } -#endif - // Compute a set of algorithm parameters: // - kernel block sizes (bm, bn, bk) // - task grain sizes (number of kernels executed per task: gm, gn) @@ -1227,190 +1210,6 @@ struct TensorEvaluator::value, - "XSMM does not support contraction output kernels."); - - template - class ContextXsmm { - public: - ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k, - const internal::TensorXsmmContractionBlocking& blocking): - device(self->m_device), - m(m), k(k), n(n), - stride_a(blocking.transposeA() ? k : m), - stride_b(blocking.transposeB() ? n : k), - stride_c(m), - bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()), - blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()), - blocks_n(blocking.blocks_n()), - copyA(blocking.copyA()), copyB(blocking.copyB()), - transposeA(blocking.transposeA()), transposeB(blocking.transposeB()), - num_threads(blocking.num_threads()), - buffer(buffer), - leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()), - workers_done(blocking.num_threads()), - - packingA_jobs(0), packingB_jobs(0), compute_jobs(0), - packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {} - - void worker() { - // Pack - - if (copyA) { - while (true) { - uint32_t mk = packingA_jobs++; - Index mi = mk / blocks_k; - Index ki = mk % blocks_k; - if (mi >= blocks_m) break; - - LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki); - if (transposeA) { - const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki); - libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki), - actual_bm(mi), stride_a, bm); - } else { - const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi); - internal::pack_simple(blockA, current_a, - actual_bk(ki), actual_bm(mi), bm, stride_a); - } - packingA_done.at(mi)++; - } - } - - if (copyB) { - while (true) { - uint32_t nk = packingB_jobs++; - Index ni = nk / blocks_k; - Index ki = nk % blocks_k; - if (ni >= blocks_n) break; - - RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki); - if (transposeB) { - const RhsScalar * current_b = rightData + (ki*bk)*stride_b + - (ni*bn); - libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni), - actual_bk(ki), stride_b, bk); - } else { - const RhsScalar * current_b = rightData + (ni*bn)*stride_b + - (ki*bk); - internal::pack_simple(blockB, current_b, - actual_bn(ni), actual_bk(ki), bk, stride_b); - } - packingB_done.at(ni)++; - } - } - - // Compute - - while (true) { - uint32_t mn = 
compute_jobs++; - Index mi = mn / blocks_n; - Index ni = mn % blocks_n; - if (mi >= blocks_m) break; - - // Wait for mi, ni packings to be done. This is more fine-grained than - // waiting for all workers to finish packing. - while ((copyA && (packingA_done.at(mi) < blocks_k)) || - (copyB && (packingB_done.at(ni) < blocks_k))) - {} - - for (Index ki=0; ki < blocks_k; ++ki) { - const LhsScalar * current_a = copyA ? - blocksA + (bk*bm) * (mi*blocks_k+ki) : - leftData + (bk*ki)*stride_a + (bm*mi); - const RhsScalar * current_b = copyB ? - blocksB + (bk*bn) * (ni*blocks_k+ki) : - rightData + (ni*bn)*stride_b + (bk*ki); - - Index current_stride_a = copyA ? bm : stride_a; - Index current_stride_b = copyB ? bk : stride_b; - - // Memory may not be zeroed, overwrite instead of adding in first - // iteration. - float beta = ki == 0 ? 0 : 1; - - Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c; - internal::libxsmm_wrapper( - 0, actual_bm(mi), actual_bn(ni), actual_bk(ki), - current_stride_a, current_stride_b, stride_c, 1, beta, 0) - (current_a, current_b, current_c); - } - } - - workers_done.Notify(); - } - - void run() { - // Parallelization strategy. - // - // First pack A into blocks (sharding by m, k) and B (sharding by n,k), - // then shard by m, n. - // - // Do not use advanced ThreadPool queuing, just run a single long-standing - // function in each thread. - if (copyA) { - blocksA = static_cast(device.allocate( - (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar))); - } - if (copyB) { - blocksB = static_cast(device.allocate( - (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar))); - } - - for (Index i = 0; i < num_threads; ++i) { - device.enqueueNoNotification([=]() { worker(); }); - } - - workers_done.Wait(); - - if (copyA) { - device.deallocate(blocksA); - } - if (copyB) { - device.deallocate(blocksB); - } - } - - private: - // real block size for block index in [0, ..., blocks - 1]. - Index actual_bm(Index mi) const { - return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m; - } - Index actual_bk(Index ki) const { - return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k; - } - Index actual_bn(Index ni) const { - return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n; - } - - const Device& device; - Index m, k, n; - Index stride_a, stride_b, stride_c; - Index bm, bk, bn; // Block sizes. - Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension. - bool copyA, copyB, transposeA, transposeB; - Index num_threads; - Scalar *buffer; - const LhsScalar *leftData; - const RhsScalar *rightData; - - LhsScalar *blocksA; - RhsScalar *blocksB; - // barrier for joining all threads after all done. - Barrier workers_done; - // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q) - std::atomic packingA_jobs; - std::atomic packingB_jobs; - std::atomic compute_jobs; - // already packed blocks for each mi-panel in A and ni-panel in B. 
- std::vector> packingA_done; - std::vector> packingB_done; - }; -#endif - }; } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e6c757275..42a450a85 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -12,17 +12,6 @@ include_directories(../../test ../../unsupported ../../Eigen find_package (Threads) -find_package(Xsmm) -if(XSMM_FOUND) - add_definitions("-DEIGEN_USE_LIBXSMM") - include_directories(${XSMM_INCLUDES}) - link_directories(${XSMM_LIBRARIES}) - set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm) - ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ") -else(XSMM_FOUND) - ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ") -endif(XSMM_FOUND) - find_package(GoogleHash) if(GOOGLEHASH_FOUND) add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT") diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 75f2e1edf..2fd128121 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -511,8 +511,6 @@ static void test_const_inputs() VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); } -#if !defined(EIGEN_USE_LIBXSMM) - // Apply Sqrt to all output elements. struct SqrtOutputKernel { template @@ -562,9 +560,6 @@ static void test_large_contraction_with_output_kernel() { } } -#endif // !defined(EIGEN_USE_LIBXSMM) - - EIGEN_DECLARE_TEST(cxx11_tensor_contraction) { CALL_SUBTEST(test_evals()); @@ -597,8 +592,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_contraction) CALL_SUBTEST(test_tensor_product()); CALL_SUBTEST(test_const_inputs()); CALL_SUBTEST(test_const_inputs()); -#if !defined(EIGEN_USE_LIBXSMM) CALL_SUBTEST(test_large_contraction_with_output_kernel()); CALL_SUBTEST(test_large_contraction_with_output_kernel()); -#endif } From 6901788013b0148d62118b73ea5eca9c7140f0d7 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 22 Aug 2019 10:50:51 -0700 Subject: [PATCH 08/30] Asynchronous parallelFor in Eigen ThreadPoolDevice --- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 197 +++++++++++++----- 1 file changed, 144 insertions(+), 53 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index ef22a268a..ca2794cb5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -75,9 +75,9 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } - + template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { return data; } @@ -181,44 +181,173 @@ struct ThreadPoolDevice { return pool_->CurrentThreadId(); } - // parallelFor executes f with [0, n) arguments in parallel and waits for - // completion. F accepts a half-open interval [first, last). - // Block size is chosen based on the iteration cost and resulting parallel + // WARNING: This function is synchronous and will block the calling thread. + // + // Synchronous parallelFor executes f with [0, n) arguments in parallel and + // waits for completion. F accepts a half-open interval [first, last). Block + // size is chosen based on the iteration cost and resulting parallel // efficiency. If block_align is not nullptr, it is called to round up the // block size. 
void parallelFor(Index n, const TensorOpCost& cost, std::function block_align, std::function f) const { - typedef TensorCostModel CostModel; + // Compute small problems directly in the caller thread. if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { f(0, n); return; } - // Calculate block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail - // effect and potential load imbalance and we also want number - // of blocks to be evenly dividable across threads. + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); - double block_size_f = 1.0 / CostModel::taskSize(1, cost); + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block.count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, + Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; + } + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } + + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, NULL, std::move(f)); + } + + // WARNING: This function is asynchronous and will not block the calling thread. + // + // Asynchronous parallelFor executes f with [0, n) arguments in parallel + // without waiting for completion. When the last block finished, it will call + // 'done' callback. F accepts a half-open interval [first, last). Block size + // is chosen based on the iteration cost and resulting parallel efficiency. If + // block_align is not nullptr, it is called to round up the block size. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f, + std::function done) const { + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + ParallelForAsyncContext* const ctx = + new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. 
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule( + [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); + lastIdx = midIdx; + } + + // Single block or less, execute directly. + ctx->f(firstIdx, lastIdx); + + // Call 'done' callback if it was the last block. + if (ctx->count.fetch_sub(1) == 1) { + (ctx->done)(); + // We can't delete ctx right now, because it will deallocate the closure + // we are currently in. + pool_->Schedule([ctx]() { delete ctx; }); + } + }; + + // Execute the root in the thread pool. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } + + // Convenience wrapper for parallelForAsync that does not align blocks. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function f, + std::function done) const { + parallelForAsync(n, cost, NULL, std::move(f), std::move(done)); + } + + // Thread pool accessor. + ThreadPoolInterface* getPool() const { return pool_; } + + // Allocator accessor. + Allocator* allocator() const { return allocator_; } + + private: + typedef TensorCostModel CostModel; + + // For parallelForAsync we must keep passed in closures on the heap, and + // delete them only after `done` callback finished. + struct ParallelForAsyncContext { + ParallelForAsyncContext(Index count, std::function f, + std::function done) + : count(count), f(std::move(f)), done(std::move(done)) {} + + std::atomic count; + std::function f; + std::function done; + + std::function handle_range; + }; + + struct ParallelForBlock { + Index size; // block size + Index count; // number of blocks + }; + + // Calculates block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate parallelization + // overheads; not too large to mitigate tail effect and potential load + // imbalance and we also want number of blocks to be evenly dividable across + // threads. + ParallelForBlock CalculateParallelForBlock( + const Index n, const TensorOpCost& cost, + std::function block_align) const { + const double block_size_f = 1.0 / CostModel::taskSize(1, cost); const Index max_oversharding_factor = 4; Index block_size = numext::mini( - n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), - block_size_f)); + n, numext::maxi( + divup(n, max_oversharding_factor * numThreads()), + block_size_f)); const Index max_block_size = numext::mini(n, 2 * block_size); + if (block_align) { Index new_block_size = block_align(block_size); eigen_assert(new_block_size >= block_size); block_size = numext::mini(n, new_block_size); } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = static_cast(block_count) / (divup(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. for (Index prev_block_count = block_count; @@ -251,47 +380,9 @@ struct ThreadPoolDevice { } } - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) { - while (lastIdx - firstIdx > block_size) { - // Split into halves and schedule the second half on a different thread. 
- const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); - lastIdx = midIdx; - } - // Single block or less, execute directly. - f(firstIdx, lastIdx); - barrier.Notify(); - }; - if (block_count <= numThreads()) { - // Avoid a thread hop by running the root of the tree and one block on the - // main thread. - handleRange(0, n); - } else { - // Execute the root in the thread pool to avoid running work on more than - // numThreads() threads. - pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); - } - barrier.Wait(); + return {block_size, block_count}; } - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, NULL, std::move(f)); - } - - // Thread pool accessor. - ThreadPoolInterface* getPool() const { return pool_; } - - // Allocator accessor. - Allocator* allocator() const { return allocator_; } - - private: ThreadPoolInterface* pool_; int num_threads_; Allocator* allocator_; From 8b5ab0e4dd70f449db52503f89cbb3767ccec38c Mon Sep 17 00:00:00 2001 From: maratek Date: Fri, 23 Aug 2019 15:25:56 -0700 Subject: [PATCH 09/30] Fix get_random_seed on Native Client Newlib in Native Client SDK does not provide ::random function. Implement get_random_seed for NaCl using ::rand, similarly to Windows version. --- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 2be4f9cc5..445248163 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -45,6 +45,14 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { uint64_t rnd = ::random() ^ mach_absolute_time(); return rnd; +#elif defined __native_client__ + // Same approach as for win32, except using clock_gettime + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec; + return rnd; #else // Augment the current time with pseudo random number generation From b021cdea6dd84b0f51dd7aea691d47dd3eab8e9d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 27 Aug 2019 11:30:31 -0700 Subject: [PATCH 10/30] Clean up float16 a.k.a. Eigen::half support in Eigen. Move the definition of half to Core/arch/Default and move arch-specific packet ops to their respective sub-directories. 
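One detail worth noting in the AVX hunks below: when the compiler exposes F16C (EIGEN_HAS_FP16_C), eight halves are converted to or from floats with a single instruction pair, otherwise the code falls back to lane-by-lane scalar conversion. A minimal standalone sketch of the fast path (requires -mf16c; helper names are illustrative):

    #include <immintrin.h>

    // 8 x fp16 packed in a __m128i -> 8 x fp32
    static inline __m256 half8_to_float8(__m128i h) {
      return _mm256_cvtph_ps(h);
    }

    // 8 x fp32 -> 8 x fp16, rounding to nearest even
    static inline __m128i float8_to_half8(__m256 f) {
      return _mm256_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }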
--- Eigen/Core | 9 +- Eigen/src/Core/arch/AVX/PacketMath.h | 362 ++++ Eigen/src/Core/arch/AVX/TypeCasting.h | 27 + Eigen/src/Core/arch/AVX512/PacketMath.h | 507 +++++- Eigen/src/Core/arch/AVX512/TypeCasting.h | 25 + Eigen/src/Core/arch/{GPU => Default}/Half.h | 18 +- Eigen/src/Core/arch/Default/TypeCasting.h | 77 + Eigen/src/Core/arch/GPU/PacketMath.h | 540 ++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 1630 ------------------- Eigen/src/Core/arch/GPU/TypeCasting.h | 161 -- Eigen/src/Core/arch/SSE/PacketMath.h | 208 +++ Eigen/src/Core/arch/SSE/TypeCasting.h | 51 + test/half_float.cpp | 2 +- 13 files changed, 1805 insertions(+), 1812 deletions(-) create mode 100644 Eigen/src/Core/arch/AVX512/TypeCasting.h rename Eigen/src/Core/arch/{GPU => Default}/Half.h (98%) create mode 100644 Eigen/src/Core/arch/Default/TypeCasting.h delete mode 100644 Eigen/src/Core/arch/GPU/PacketMathHalf.h diff --git a/Eigen/Core b/Eigen/Core index af741a241..e6e31caee 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -160,6 +160,9 @@ using std::ptrdiff_t; #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" #include "src/Core/arch/Default/ConjHelper.h" +// Generic half float support +#include "src/Core/arch/Default/Half.h" +#include "src/Core/arch/Default/TypeCasting.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" @@ -207,14 +210,10 @@ using std::ptrdiff_t; #include "src/Core/arch/MSA/Complex.h" #endif -// Half float support -#include "src/Core/arch/GPU/Half.h" -#include "src/Core/arch/GPU/PacketMathHalf.h" -#include "src/Core/arch/GPU/TypeCasting.h" - #if defined EIGEN_VECTORIZE_GPU #include "src/Core/arch/GPU/PacketMath.h" #include "src/Core/arch/GPU/MathFunctions.h" + #include "src/Core/arch/GPU/TypeCasting.h" #endif #if defined(EIGEN_USE_SYCL) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 5233195f3..020f6c276 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -31,10 +31,14 @@ namespace internal { typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; +typedef struct { + __m128i x; +} Packet8h; template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ const Packet8f p8f_##NAME = pset1(X) @@ -97,6 +101,35 @@ template<> struct packet_traits : default_packet_traits HasCeil = 1 }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. 
+ typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; #endif template<> struct scalar_div_cost { enum { value = 14 }; }; @@ -847,6 +880,335 @@ template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) return _mm256_blend_pd(a,pset1(b),(1<<3)); } + +// Packet math for Eigen::half +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + // in some cases Packet4i is a 
wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { + Packet8h r; r.x = _mm_blendv_epi8(b.x, a.x, mask.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pcmp_eq(af, bf); + // Pack the 32-bit flags into 16-bits flags. + Packet8h result; result.x = _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), + _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8h sign_mask; sign_mask.x = _mm_set1_epi16(static_cast(0x8000)); + Packet8h result; result.x = _mm_xor_si128(a.x, sign_mask.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + 
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { + Packet8f pf[8]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + Packet8f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet8h res; + res.x = _mm_shuffle_epi8(a.x,m); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),7); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) + { + if (Offset!=0) + first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); + } +}; + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + 
pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 7d2e1e67f..910fc06ca 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -52,6 +52,33 @@ template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Pa return _mm256_castsi256_ps(a); } + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 0a81fe02d..e37855693 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -44,6 +44,41 @@ template <> struct is_arithmetic<__m512d> { enum { value = true }; }; +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. 
+ typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; template<> struct packet_traits : default_packet_traits { @@ -60,8 +95,8 @@ template<> struct packet_traits : default_packet_traits #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, + HasLog1p = 1, + HasExpm1 = 1, #endif HasExp = 1, HasSqrt = EIGEN_FAST_MATH, @@ -120,6 +155,13 @@ struct unpacket_traits { enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false }; }; +template<> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet16h half; + enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; + template <> EIGEN_STRONG_INLINE Packet16f pset1(const float& from) { return _mm512_set1_ps(from); @@ -1397,6 +1439,467 @@ template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const return _mm512_castsi512_ps(a); } + +// Packet math for Eigen::half +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_store_si256((__m256i*)(void*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_storeu_si256((__m256i*)(void*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float 
f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { + Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + // in some cases Packet8i is a wrapper around __m256i, so we need to + // cast to Packet8i to call the correct overload. + Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { + Packet16h r; r.x = _mm256_blendv_epi8(b.x, a.x, mask.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pcmp_eq(af, bf); + // Pack the 32-bit flags into 16-bits flags. 
+ __m256i lo = _mm256_castps_si256(extract256<0>(rf)); + __m256i hi = _mm256_castps_si256(extract256<1>(rf)); + __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), + _mm256_extractf128_si256(lo, 1)); + __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), + _mm256_extractf128_si256(hi, 1)); + Packet16h result; result.x = _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + Packet16h sign_mask; sign_mask.x = _mm256_set1_epi16(static_cast(0x8000)); + Packet16h result; result.x = _mm256_xor_si256(a.x, sign_mask.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { + Packet16f pf[16]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + pf[8] = half2float(p[8]); + pf[9] = half2float(p[9]); + pf[10] = half2float(p[10]); + pf[11] = half2float(p[11]); + pf[12] = half2float(p[12]); + pf[13] = half2float(p[13]); + pf[14] = half2float(p[14]); + pf[15] = half2float(p[15]); + Packet16f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet16h res; + res.x = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,15); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + 
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = 
_mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; 
++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h new file mode 100644 index 000000000..777a26ae4 --- /dev/null +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -0,0 +1,25 @@ +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/Default/Half.h similarity index 98% rename from Eigen/src/Core/arch/GPU/Half.h rename to Eigen/src/Core/arch/Default/Half.h index 655dc20d5..56782b340 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -33,8 +33,8 @@ // to disk and the likes), but fast on GPUs. -#ifndef EIGEN_HALF_GPU_H -#define EIGEN_HALF_GPU_H +#ifndef EIGEN_HALF_H +#define EIGEN_HALF_H #if __cplusplus > 199711L #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() @@ -76,7 +76,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} #if defined(EIGEN_HAS_GPU_FP16) @@ -114,8 +113,7 @@ struct half : public half_impl::half_base { EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} - EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} - + #if defined(EIGEN_HAS_GPU_FP16) #if defined(EIGEN_HAS_HIP_FP16) EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} @@ -125,7 +123,7 @@ struct half : public half_impl::half_base { #endif #endif #endif - + explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -175,12 +173,6 @@ struct half : public half_impl::half_base { EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { return static_cast(half_impl::half_to_float(*this)); } - - EIGEN_DEVICE_FUNC half& operator=(const half& other) { - x = other.x; - return *this; - } - }; } // end namespace Eigen @@ -761,4 +753,4 @@ bool (isfinite)(const Eigen::half& h) { } // namespace numext #endif -#endif // EIGEN_HALF_GPU_H +#endif // EIGEN_HALF_H diff --git a/Eigen/src/Core/arch/Default/TypeCasting.h b/Eigen/src/Core/arch/Default/TypeCasting.h new file mode 100644 index 000000000..b6df98468 --- /dev/null +++ b/Eigen/src/Core/arch/Default/TypeCasting.h @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERIC_TYPE_CASTING_H +#define EIGEN_GENERIC_TYPE_CASTING_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + +} +} + +#endif // EIGEN_GENERIC_TYPE_CASTING_H diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 5084fc786..d1df26e57 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -458,6 +458,546 @@ ptranspose(PacketBlock& kernel) { #endif +// Packet math for Eigen::half +// Most of the following operations require arch >= 3.0 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \ + (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) + half2 r; + r.x = from; + r.y = from; + return r; +#else + return __half2half2(from); +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const 
Eigen::half* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) + to[0] = from.x; + to[1] = from.y; +#else + to[0] = __low2half(from); + to[1] = __high2half(from); +#endif +} + +template<> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __ldg((const half2*)from); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 350 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif + +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(__ldg(from+0), __ldg(from+1)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 350 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); + half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); + return __halves2half2(result1, result2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(a, __hadd(a, __float2half(1.0f))); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return __halves2half2(a, __float2half(f)); +#endif + +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, + const half2& a, + const half2& b) { + half mask_low = __low2half(mask); + half mask_high 
= __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; + return __halves2half2(eq1, eq2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); + return __halves2half2(result1, result2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hsub2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hneg2(a); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif + +#endif 
+} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hfma2(a, b, c); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hfma2(a, b, c); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c1; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __h2div(a, b); + +#else // EIGEN_CUDA_ARCH + + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 + a2)); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 > a2 ? __low2half(a) : __high2half(a); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? 
first : second; + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 < a2 ? __low2half(a) : __high2half(a); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 * a2)); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#else + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = logf(a1); + float r2 = logf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expf(a1); + float r2 = expf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = sqrtf(a1); + float r2 = sqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = rsqrtf(a1); + float r2 = rsqrtf(a2); + return __floats2half2_rn(r1, r2); +} +#endif + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h deleted file mode 100644 index 5e143e65a..000000000 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ /dev/null @@ -1,1630 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_PACKET_MATH_HALF_GPU_H -#define EIGEN_PACKET_MATH_HALF_GPU_H - - -namespace Eigen { -namespace internal { - -// Most of the following operations require arch >= 3.0 -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \ - (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) - -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef half2 type; - typedef half2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasExpm1 = 1, - HasLog = 1, - HasLog1p = 1 - }; -}; - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) - half2 r; - r.x = from; - r.y = from; - return r; -#else - return __half2half2(from); -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) - to[0] = from.x; - to[1] = from.y; -#else - to[0] = __low2half(from); - to[1] = __high2half(from); -#endif -} - -template<> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __ldg((const half2*)from); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - return __ldg((const half2*)from); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif - -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __halves2half2(__ldg(from+0), __ldg(from+1)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half result1 = 
half_impl::raw_uint16_to_half(a1.x & 0x7FFF); - half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return __halves2half2(result1, result2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - return pset1(true_half); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { - half false_half = half_impl::raw_uint16_to_half(0x0000u); - return pset1(false_half); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __halves2half2(a, __hadd(a, __float2half(1.0f))); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif - -#endif -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, - const half2& a, - const half2& b) { - half mask_low = __low2half(mask); - half mask_high = __high2half(mask); - half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); - half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); - return __halves2half2(result_low, result_high); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, - const half2& b) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; - half eq2 = __half2float(a2) == __half2float(b2) ? 
true_half : false_half; - return __halves2half2(eq1, eq2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return __halves2half2(result1, result2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return __halves2half2(result1, result2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return __halves2half2(result1, result2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return __halves2half2(result1, result2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hsub2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hsub2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hneg2(a); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if 
defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hfma2(a, b, c); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __h2div(a, b); - -#else // EIGEN_CUDA_ARCH - - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd(__low2half(a), __high2half(a)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 + a2)); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? 
__low2half(a) : __high2half(a); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul(__low2half(a), __high2half(a)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 * a2)); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = log1pf(a1); - float r2 = log1pf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expm1f(a1); - float r2 = expm1f(a2); - return __floats2half2_rn(r1, r2); -} - -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 plog(const half2& a) { - return h2log(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 pexp(const half2& a) { - return h2exp(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 psqrt(const half2& a) { - return h2sqrt(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 prsqrt(const half2& a) { - return h2rsqrt(a); -} - -#else - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -#endif - -#elif defined EIGEN_VECTORIZE_AVX512 - -typedef struct { - __m256i x; -} Packet16h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16h type; - // There is no half-size packet for Packet16h. 
- typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet16h half; }; - -template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { - Packet16h result; - result.x = _mm256_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_load_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_loadu_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - // (void*) -> workaround clang warning: - // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 - _mm256_store_si256((__m256i*)(void*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - // (void*) -> workaround clang warning: - // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 - _mm256_storeu_si256((__m256i*)(void*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploaddup(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - unsigned short e = from[4].x; - unsigned short f = from[5].x; - unsigned short g = from[6].x; - unsigned short h = from[7].x; - result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploadquad(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtph_ps(a.x); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet16h result; - result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); 
- half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - Packet16h result; - result.x = _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { - Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { - Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { - // in some cases Packet8i is a wrapper around __m256i, so we need to - // cast to Packet8i to call the correct overload. - Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r; -} -template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r; -} -template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r; -} -template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { - Packet16h r; r.x = _mm256_blendv_epi8(b.x, a.x, mask.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pcmp_eq(af, bf); - // Pack the 32-bit flags into 16-bits flags. 
- __m256i lo = _mm256_castps_si256(extract256<0>(rf)); - __m256i hi = _mm256_castps_si256(extract256<1>(rf)); - __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), - _mm256_extractf128_si256(lo, 1)); - __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), - _mm256_extractf128_si256(hi, 1)); - Packet16h result; result.x = _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { - Packet16h sign_mask; sign_mask.x = _mm256_set1_epi16(static_cast(0x8000)); - Packet16h result; result.x = _mm256_xor_si256(a.x, sign_mask.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = psub(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pdiv(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - -template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux_mul(from_float)); -} - -template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { - Packet16f pf[16]; - pf[0] = half2float(p[0]); - pf[1] = half2float(p[1]); - pf[2] = half2float(p[2]); - pf[3] = half2float(p[3]); - pf[4] = half2float(p[4]); - pf[5] = half2float(p[5]); - pf[6] = half2float(p[6]); - pf[7] = half2float(p[7]); - pf[8] = half2float(p[8]); - pf[9] = half2float(p[9]); - pf[10] = half2float(p[10]); - pf[11] = half2float(p[11]); - pf[12] = half2float(p[12]); - pf[13] = half2float(p[13]); - pf[14] = half2float(p[14]); - pf[15] = half2float(p[15]); - Packet16f reduced = preduxp(pf); - return float2half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) -{ - __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); - Packet16h res; - res.x = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), - _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) -{ - Packet16h res; - res.x = _mm256_insert_epi16(a.x,b.x,0); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) -{ - Packet16h res; - res.x = _mm256_insert_epi16(a.x,b.x,15); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) -{ - Packet16h result; - result.x = _mm256_set_epi16( - from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, - from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, - from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, - 
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) -{ - EIGEN_ALIGN64 half aux[16]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; - to[stride*8].x = aux[8].x; - to[stride*9].x = aux[9].x; - to[stride*10].x = aux[10].x; - to[stride*11].x = aux[11].x; - to[stride*12].x = aux[12].x; - to[stride*13].x = aux[13].x; - to[stride*14].x = aux[14].x; - to[stride*15].x = aux[15].x; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m256i a = kernel.packet[0].x; - __m256i b = kernel.packet[1].x; - __m256i c = kernel.packet[2].x; - __m256i d = kernel.packet[3].x; - __m256i e = kernel.packet[4].x; - __m256i f = kernel.packet[5].x; - __m256i g = kernel.packet[6].x; - __m256i h = kernel.packet[7].x; - __m256i i = kernel.packet[8].x; - __m256i j = kernel.packet[9].x; - __m256i k = kernel.packet[10].x; - __m256i l = kernel.packet[11].x; - __m256i m = kernel.packet[12].x; - __m256i n = kernel.packet[13].x; - __m256i o = kernel.packet[14].x; - __m256i p = kernel.packet[15].x; - - __m256i ab_07 = _mm256_unpacklo_epi16(a, b); - __m256i cd_07 = _mm256_unpacklo_epi16(c, d); - __m256i ef_07 = _mm256_unpacklo_epi16(e, f); - __m256i gh_07 = _mm256_unpacklo_epi16(g, h); - __m256i ij_07 = _mm256_unpacklo_epi16(i, j); - __m256i kl_07 = _mm256_unpacklo_epi16(k, l); - __m256i mn_07 = _mm256_unpacklo_epi16(m, n); - __m256i op_07 = _mm256_unpacklo_epi16(o, p); - - __m256i ab_8f = _mm256_unpackhi_epi16(a, b); - __m256i cd_8f = _mm256_unpackhi_epi16(c, d); - __m256i ef_8f = _mm256_unpackhi_epi16(e, f); - __m256i gh_8f = _mm256_unpackhi_epi16(g, h); - __m256i ij_8f = _mm256_unpackhi_epi16(i, j); - __m256i kl_8f = _mm256_unpackhi_epi16(k, l); - __m256i mn_8f = _mm256_unpackhi_epi16(m, n); - __m256i op_8f = _mm256_unpackhi_epi16(o, p); - - __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); - __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); - __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); - __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); - __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); - __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); - __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); - __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); - - __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); - __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); - __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); - __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); - __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); - __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); - __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); - __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); - - __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); - __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); - __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); - __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); - __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); - __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); - __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); - __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); - __m256i abcdefgh_89 = 
_mm256_unpacklo_epi64(abcd_8b, efgh_8b); - __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); - __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); - __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); - __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); - __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); - __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); - __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); - - // NOTE: no unpacklo/hi instr in this case, so using permute instr. - __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); - - kernel.packet[0].x = a_p_0; - kernel.packet[1].x = a_p_1; - kernel.packet[2].x = a_p_2; - kernel.packet[3].x = a_p_3; - kernel.packet[4].x = a_p_4; - kernel.packet[5].x = a_p_5; - kernel.packet[6].x = a_p_6; - kernel.packet[7].x = a_p_7; - kernel.packet[8].x = a_p_8; - kernel.packet[9].x = a_p_9; - kernel.packet[10].x = a_p_a; - kernel.packet[11].x = a_p_b; - kernel.packet[12].x = a_p_c; - kernel.packet[13].x = a_p_d; - kernel.packet[14].x = a_p_e; - kernel.packet[15].x = a_p_f; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[8][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - pstore(in[4], kernel.packet[4]); - pstore(in[5], kernel.packet[5]); - pstore(in[6], kernel.packet[6]); - pstore(in[7], kernel.packet[7]); - - EIGEN_ALIGN64 half out[8][16]; - - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 8; ++j) { - out[i][j+8] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); - kernel.packet[4] = pload(out[4]); - kernel.packet[5] = pload(out[5]); - kernel.packet[6] = pload(out[6]); - kernel.packet[7] = pload(out[7]); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[4][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN64 half out[4][16]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][4*i]; - } - for (int j = 0; j < 4; 
++j) { - out[i][j+4] = in[j][4*i+1]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+8] = in[j][4*i+2]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+12] = in[j][4*i+3]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -#elif defined EIGEN_VECTORIZE_AVX - -typedef struct { - __m128i x; -} Packet8h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet8h type; - // There is no half-size packet for Packet8h. - typedef Packet8h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; - -template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { - Packet8h result; - result.x = _mm_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { - Packet8h result; - result.x = _mm_load_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { - Packet8h result; - result.x = _mm_loadu_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { - _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploaddup(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploadquad(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtph_ps(a.x); -#else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet8h result; - result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - Eigen::half h4(aux[4]); - Eigen::half h5(aux[5]); - Eigen::half h6(aux[6]); - Eigen::half h7(aux[7]); - - Packet8h 
result; - result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { - Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { - // in some cases Packet4i is a wrapper around __m128i, so we either need to - // cast to Packet4i to directly call the intrinsics as below: - Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r; -} -template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r; -} -template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r; -} -template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { - Packet8h r; r.x = _mm_blendv_epi8(b.x, a.x, mask.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pcmp_eq(af, bf); - // Pack the 32-bit flags into 16-bits flags. - Packet8h result; result.x = _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), - _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { - Packet8h sign_mask; sign_mask.x = _mm_set1_epi16(static_cast(0x8000)); - Packet8h result; result.x = _mm_xor_si128(a.x, sign_mask.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = psub(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pdiv(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) -{ - Packet8h result; - result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) -{ - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux(af); - return 
Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { - Packet8f pf[8]; - pf[0] = half2float(p[0]); - pf[1] = half2float(p[1]); - pf[2] = half2float(p[2]); - pf[3] = half2float(p[3]); - pf[4] = half2float(p[4]); - pf[5] = half2float(p[5]); - pf[6] = half2float(p[6]); - pf[7] = half2float(p[7]); - Packet8f reduced = preduxp(pf); - return float2half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) -{ - __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); - Packet8h res; - res.x = _mm_shuffle_epi8(a.x,m); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) -{ - Packet8h res; - res.x = _mm_insert_epi16(a.x,int(b.x),0); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) -{ - Packet8h res; - res.x = _mm_insert_epi16(a.x,int(b.x),7); - return res; -} - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) - { - if (Offset!=0) - first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); - } -}; - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m128i a = kernel.packet[0].x; - __m128i b = kernel.packet[1].x; - __m128i c = kernel.packet[2].x; - __m128i d = kernel.packet[3].x; - __m128i e = kernel.packet[4].x; - __m128i f = kernel.packet[5].x; - __m128i g = kernel.packet[6].x; - __m128i h = kernel.packet[7].x; - - __m128i a03b03 = _mm_unpacklo_epi16(a, b); - __m128i c03d03 = _mm_unpacklo_epi16(c, d); - __m128i e03f03 = _mm_unpacklo_epi16(e, f); - __m128i g03h03 = _mm_unpacklo_epi16(g, h); - __m128i a47b47 = _mm_unpackhi_epi16(a, b); - __m128i c47d47 = _mm_unpackhi_epi16(c, d); - __m128i e47f47 = _mm_unpackhi_epi16(e, f); - __m128i g47h47 = _mm_unpackhi_epi16(g, h); - - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); - - __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); - __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); - __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); - __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); - __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); - __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); - __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); - __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); - - kernel.packet[0].x = a0b0c0d0e0f0g0h0; - kernel.packet[1].x 
= a1b1c1d1e1f1g1h1; - kernel.packet[2].x = a2b2c2d2e2f2g2h2; - kernel.packet[3].x = a3b3c3d3e3f3g3h3; - kernel.packet[4].x = a4b4c4d4e4f4g4h4; - kernel.packet[5].x = a5b5c5d5e5f5g5h5; - kernel.packet[6].x = a6b6c6d6e6f6g6h6; - kernel.packet[7].x = a7b7c7d7e7f7g7h7; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 Eigen::half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN32 Eigen::half out[4][8]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -typedef struct { - __m64 x; -} Packet4h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4h type; - // There is no half-size packet for Packet4h. - typedef Packet4h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; - -template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { - Packet4h result; - result.x = _mm_set1_pi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); -} - -template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha + hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = 
half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha - hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha * hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha / hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE Packet4h -ploadquad(const Eigen::half* from) { - return pset1(*from); -} - -template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) -{ - Packet4h result; - result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) -{ - __int64_t a = _mm_cvtm64_si64(from.x); - to[stride*0].x = static_cast(a); - to[stride*1].x = static_cast(a >> 16); - to[stride*2].x = static_cast(a >> 32); - to[stride*3].x = static_cast(a >> 48); -} - -EIGEN_STRONG_INLINE 
void -ptranspose(PacketBlock& kernel) { - __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); - __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); - - kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); - kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); - kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); - kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); -} - -#endif - -} -} - -#endif // EIGEN_PACKET_MATH_HALF_GPU_H diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 57a55d08b..c278f3fe8 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -14,64 +14,6 @@ namespace Eigen { namespace internal { -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __float2half(a); - #else - return Eigen::half(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __float2half(static_cast(a)); - #else - return Eigen::half(static_cast(a)); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __half2float(a); - #else - return static_cast(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) @@ -104,109 +46,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(cons return __floats2half2_rn(a.x, a.y); } -#elif defined EIGEN_VECTORIZE_AVX512 -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { - return float2half(a); -} - -#elif defined EIGEN_VECTORIZE_AVX - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { - return 
half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { - return float2half(a); -} - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - Eigen::half h = raw_uint16_to_half(static_cast(a64)); - float f1 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 16)); - float f2 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 32)); - float f3 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 48)); - float f4 = static_cast(h); - return _mm_set_ps(f4, f3, f2, f1); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { - EIGEN_ALIGN16 float aux[4]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - - Packet4h result; - result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); - return result; -} - #endif } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 94603dd55..b59e2c602 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -1055,6 +1055,214 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } #endif + +// Packet math for Eigen::half +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. 
+ typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + 
return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index f607366f0..1b8e9a550 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -77,6 +77,57 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Pa return _mm_castsi128_ps(a); } + +// Disable the following code since it's broken on too many platforms / compilers. 
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/test/half_float.cpp b/test/half_float.cpp index 2a7f9b497..48afdb21b 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -9,7 +9,7 @@ #include "main.h" -#include +#include // Make sure it's possible to forward declare Eigen::half namespace Eigen { From 9aba527405b40132a308f5f782dacadf6ef50acd Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 27 Aug 2019 15:35:29 -0700 Subject: [PATCH 11/30] Revert changes to std_falback::log1p that broke handling of arguments less than -1. Fix packet op accordingly. --- Eigen/src/Core/MathFunctions.h | 3 +-- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 4 +--- test/packetmath.cpp | 2 -- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 1eeb2752b..fcf62011e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -550,9 +550,8 @@ namespace std_fallback { EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; Scalar log_1p = log(x1p); - const bool is_inf = numext::equal_strict(x1p, log_1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); - return (is_inf || is_small) ? x : x * (log_1p / (x1p - RealScalar(1))); + return is_small ? x : x * (log_1p / (x1p - RealScalar(1))); } } diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 640aae05a..505a0eec8 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -137,10 +137,8 @@ Packet generic_plog1p(const Packet& x) Packet xp1 = padd(x, one); Packet small_mask = pcmp_eq(xp1, one); Packet log1 = plog(xp1); - // Add a check to handle x == +inf. - Packet pos_inf_mask = pcmp_eq(x, log1); Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); - return pselect(por(small_mask, pos_inf_mask), x, log_large); + return pselect(small_mask, x, log_large); } /** \internal \returns exp(x)-1 computed using W. Kahan's formula. 
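For reference, the corner cases at stake in this revert, checked against std::log1p from <cmath>. This is a standalone sketch for illustration only; it is not part of the patch and assumes an IEEE-754 platform. The std_fallback and generic_plog1p code touched above are expected to agree with these reference results:

#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  // x < -1: outside the domain, the result should be NaN (the case that the
  // reverted change had broken, per the commit message above).
  std::printf("log1p(-2)    = %g\n", std::log1p(-2.0));   // nan
  // x == -1: pole of the logarithm, -inf.
  std::printf("log1p(-1)    = %g\n", std::log1p(-1.0));   // -inf
  // |x| tiny: log1p(x) ~= x, the `is_small` branch in the code above.
  std::printf("log1p(1e-20) = %g\n", std::log1p(1e-20));  // ~1e-20
  // x == +inf: should remain +inf; this case is re-introduced with an explicit
  // inf mask in PATCH 13/30 below without disturbing the x < -1 behavior.
  std::printf("log1p(inf)   = %g\n", std::log1p(inf));    // inf
  return 0;
}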
diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 41000a842..28768b18d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -607,8 +607,6 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); - data1[0] = std::numeric_limits::infinity(); - data1[1] = std::numeric_limits::denorm_min(); CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); #endif From 6e77f9bef35012f160b307bdeae73194fde91e51 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 28 Aug 2019 10:32:19 -0700 Subject: [PATCH 12/30] Remove shadow warnings in TensorDeviceThreadPool --- .../Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index ca2794cb5..edb0b3e25 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -90,7 +90,6 @@ struct ThreadPoolDevice { // CPU cycles due to the threads competing for memory bandwidth, so we // statically schedule at most 4 block copies here. const size_t kMinBlockSize = 32768; - typedef TensorCostModel CostModel; const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); if (n <= kMinBlockSize || num_threads < 2) { ::memcpy(dst, src, n); @@ -302,9 +301,12 @@ struct ThreadPoolDevice { // For parallelForAsync we must keep passed in closures on the heap, and // delete them only after `done` callback finished. struct ParallelForAsyncContext { - ParallelForAsyncContext(Index count, std::function f, - std::function done) - : count(count), f(std::move(f)), done(std::move(done)) {} + ParallelForAsyncContext(Index block_count, + std::function block_f, + std::function done_callback) + : count(block_count), + f(std::move(block_f)), + done(std::move(done_callback)) {} std::atomic count; std::function f; From 1187bb65ad196161a07f4e0125e478d022ea1b08 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 28 Aug 2019 12:20:21 -0700 Subject: [PATCH 13/30] Add more tests for corner cases of log1p and expm1. Add handling of infinite arguments to log1p such that log1p(inf) = inf. --- Eigen/src/Core/MathFunctions.h | 3 ++- .../Core/arch/Default/GenericPacketMathFunctions.h | 3 ++- test/packetmath.cpp | 14 +++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index fcf62011e..fbec39d83 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -551,7 +551,8 @@ namespace std_fallback { Scalar x1p = RealScalar(1) + x; Scalar log_1p = log(x1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); - return is_small ? x : x * (log_1p / (x1p - RealScalar(1))); + const bool is_inf = numext::equal_strict(x1p, log_1p); + return (is_small || is_inf) ? 
x : x * (log_1p / (x1p - RealScalar(1))); } } diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 505a0eec8..0fc673e12 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -137,8 +137,9 @@ Packet generic_plog1p(const Packet& x) Packet xp1 = padd(x, one); Packet small_mask = pcmp_eq(xp1, one); Packet log1 = plog(xp1); + Packet inf_mask = pcmp_eq(xp1, log1); Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); - return pselect(small_mask, x, log_large); + return pselect(por(small_mask, inf_mask), x, log_large); } /** \internal \returns exp(x)-1 computed using W. Kahan's formula. diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 28768b18d..67ff6dc5b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -607,8 +607,12 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); - CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); + data1[0] = std::numeric_limits::infinity(); + data1[1] = Scalar(-1); CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); + data1[0] = std::numeric_limits::infinity(); + data1[1] = -std::numeric_limits::infinity(); + CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); #endif if(PacketSize>=2) @@ -648,6 +652,14 @@ template void packetmath_real() h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isinf)(data2[0])); } + if(PacketTraits::HasLog1p) { + packet_helper h; + data1[0] = Scalar(-2); + data1[1] = -std::numeric_limits::infinity(); + h.store(data2, internal::plog1p(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + VERIFY((numext::isnan)(data2[1])); + } if(PacketTraits::HasSqrt) { packet_helper h; From bc40d4522c56fdf861fcdab28f4b7db609d8065e Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 28 Aug 2019 17:46:05 -0700 Subject: [PATCH 14/30] Const correctness in TensorMap> expressions --- .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorMap.h | 66 +++++++++++-------- unsupported/test/cxx11_tensor_map.cpp | 41 +++++++++--- 3 files changed, 74 insertions(+), 34 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a95f22631..3cca0c7e9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,6 +20,7 @@ namespace Eigen { // map_allocator. 
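// Editor's note: the ConstType alias added below is consumed by TensorMap in the next
// hunk, where it forms PointerConstType/StoragePointerType so that a TensorMap built
// over a const (rvalue) expression only ever hands out const pointers and references.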
template struct MakePointer { typedef T* Type; + typedef const T* ConstType; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 395cdf9c8..b28cd822f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -42,13 +42,27 @@ template class MakePoin typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; - /* typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType;*/ typedef typename MakePointer_::Type PointerType; - typedef PointerType PointerArgType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + >::type StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar&, + const Scalar& + >::type StorageRefType; static const int Options = Options_; @@ -63,47 +77,47 @@ template class MakePoin }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -120,9 +134,9 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerType data() { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } + EIGEN_STRONG_INLINE PointerConstType data() const { return m_data; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const @@ -213,7 +227,7 @@ template class MakePoin #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -226,14 +240,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() + EIGEN_STRONG_INLINE StorageRefType operator()() { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { 
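    // Editor's note: StorageRefType resolves to Scalar& for lvalue expressions and to
    // const Scalar& when the map was constructed over an rvalue/const expression, so
    // writing through a TensorMap constructed over a const Tensor is rejected at
    // compile time.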
eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -241,7 +255,7 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); @@ -256,7 +270,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -267,7 +281,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -278,7 +292,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -289,7 +303,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -320,7 +334,7 @@ template class MakePoin } private: - typename MakePointer_::Type m_data; + StoragePointerType m_data; Dimensions m_dimensions; }; diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index ce608aca7..dc8532f5c 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -19,8 +19,8 @@ static void test_0d() Tensor scalar1; Tensor scalar2; - TensorMap > scalar3(scalar1.data()); - TensorMap > scalar4(scalar2.data()); + TensorMap > scalar3(scalar1.data()); + TensorMap > scalar4(scalar2.data()); scalar1() = 7; scalar2() = 13; @@ -37,8 +37,8 @@ static void test_1d() Tensor vec1(6); Tensor vec2(6); - TensorMap > vec3(vec1.data(), 6); - TensorMap > vec4(vec2.data(), 6); + TensorMap > vec3(vec1.data(), 6); + TensorMap > vec4(vec2.data(), 6); vec1(0) = 4; vec2(0) = 0; vec1(1) = 8; vec2(1) = 1; @@ -85,8 +85,8 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; - TensorMap > mat3(mat1.data(), 2, 3); - TensorMap > mat4(mat2.data(), 2, 3); + TensorMap > mat3(mat1.data(), 2, 3); + TensorMap > mat4(mat2.data(), 2, 3); VERIFY_IS_EQUAL(mat3.rank(), 2); VERIFY_IS_EQUAL(mat3.size(), 6); @@ -129,8 +129,8 @@ static void test_3d() } } - TensorMap > mat3(mat1.data(), 2, 3, 7); - TensorMap > mat4(mat2.data(), 2, 3, 7); + TensorMap > mat3(mat1.data(), 2, 3, 7); + TensorMap > mat4(mat2.data(), 2, 3, 7); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); @@ -265,6 
+265,29 @@ static void test_casting() VERIFY_IS_EQUAL(sum1, 861); } +template +static const T& add_const(T& value) { + return value; +} + +static void test_0d_const_tensor() +{ + Tensor scalar1; + Tensor scalar2; + + TensorMap > scalar3(add_const(scalar1).data()); + TensorMap > scalar4(add_const(scalar2).data()); + + scalar1() = 7; + scalar2() = 13; + + VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + + VERIFY_IS_EQUAL(scalar3(), 7); + VERIFY_IS_EQUAL(scalar4(), 13); +} + EIGEN_DECLARE_TEST(cxx11_tensor_map) { CALL_SUBTEST(test_0d()); @@ -274,4 +297,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_map) CALL_SUBTEST(test_from_tensor()); CALL_SUBTEST(test_casting()); + + CALL_SUBTEST(test_0d_const_tensor()); } From f6c51d9209ccc04d28c39f4c8059e7d3e74d6e07 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 30 Aug 2019 14:03:29 -0700 Subject: [PATCH 15/30] Fix missing header inclusion and colliding definitions for half type casting, which broke build with -march=native on Haswell/Skylake. --- Eigen/Core | 1 + Eigen/src/Core/arch/AVX/TypeCasting.h | 3 +++ Eigen/src/Core/arch/AVX512/TypeCasting.h | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index e6e31caee..9d8f8fce8 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -172,6 +172,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX/TypeCasting.h" #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" + #include "src/Core/arch/AVX512/TypeCasting.h" #include "src/Core/arch/AVX512/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 910fc06ca..181043588 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -52,6 +52,7 @@ template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Pa return _mm256_castsi256_ps(a); } +#ifndef EIGEN_VECTORIZE_AVX512 template <> struct type_casting_traits { @@ -75,6 +76,8 @@ struct type_casting_traits { }; }; +#endif // EIGEN_VECTORIZE_AVX512 + template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { return float2half(a); } diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h index 777a26ae4..a82176941 100644 --- a/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -1,3 +1,19 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_AVX512_H +#define EIGEN_TYPE_CASTING_AVX512_H + +namespace Eigen { + +namespace internal { + template <> struct type_casting_traits { enum { @@ -23,3 +39,9 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { return float2half(a); } + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_AVX512_H From 66665e7e76d2ad5aa37775b3777e9a53c6d1c18c Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 14:49:40 -0700 Subject: [PATCH 16/30] Asynchronous expression evaluation with TensorAsyncDevice --- unsupported/Eigen/CXX11/ThreadPool | 2 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 11 + .../Eigen/CXX11/src/Tensor/TensorBlock.h | 1 + .../Eigen/CXX11/src/Tensor/TensorDevice.h | 41 +++ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 271 +++++++++++++++--- .../src/Tensor/TensorForwardDeclarations.h | 6 + unsupported/test/cxx11_tensor_executor.cpp | 107 ++++++- unsupported/test/cxx11_tensor_thread_pool.cpp | 39 ++- 8 files changed, 414 insertions(+), 64 deletions(-) diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index d046af9b2..613fdb57a 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index dbacf494e..095c85dc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -1063,6 +1063,17 @@ class TensorBase : public TensorBase { return TensorDevice(dev, derived()); } +#ifdef EIGEN_USE_THREADS + // Select the async device on which to evaluate the expression. + template + typename internal::enable_if< + internal::is_same::value, + TensorAsyncDevice>::type + device(const DeviceType& dev, std::function done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); + } +#endif // EIGEN_USE_THREADS + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 49fb21dc8..c8a8b16db 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -932,6 +932,7 @@ class TensorBlockMapper { typedef TensorBlock Block; typedef DSizes Dimensions; + TensorBlockMapper() {} TensorBlockMapper(const Dimensions& dims, const TensorBlockShapeType block_shape, Index min_target_size) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 29e50a3b2..5122b3623 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -63,6 +63,47 @@ template class TensorDevice { ExpressionType& m_expression; }; +#ifdef EIGEN_USE_THREADS + +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device (currently supports only + * ThreadPoolDevice). 
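 *
 * (Editorial addition) The assignment does not block the caller; the supplied `done`
 * callback is invoked once evaluation has completed (typically from a pool thread),
 * so callers that need to wait for the result usually pair it with an Eigen::Barrier,
 * as the unit tests added later in this series do.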
+ * + * Example: + * std::function done = []() {}; + * C.device(EIGEN_THREAD_POOL, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, + std::function done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. + Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + std::function m_done; +}; + +#endif // EIGEN_USE_THREADS + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index fddb90d77..18d9de9e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -84,7 +84,7 @@ class TensorExecutor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, - const Device& device = Device()) { + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { @@ -97,6 +97,14 @@ class TensorExecutor { } }; +/** + * Default async execution strategy is not implemented. Currently it's only + * available for ThreadPoolDevice (see definition below). + */ +template +class TensorAsyncExecutor {}; + /** * Process all the data with a single cpu thread, using vectorized instructions. */ @@ -107,8 +115,8 @@ class TensorExecutor evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { @@ -206,8 +214,81 @@ class TensorExecutor +struct TensorExecutorTilingContext { + typedef typename TensorBlockMapper::Block TensorBlock; + + TensorExecutorTilingContext() : buffer(nullptr) {} + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, + const TensorOpCost& b_cost, void* b_buffer, + size_t b_aligned_size) + : block_mapper(b_mapper), + cost(b_cost), + buffer(b_buffer), + aligned_blocksize(b_aligned_size) {} + + template + Scalar* GetCurrentThreadBuffer(const ThreadPoolDevice& device) const { + // ThreadPoolDevice::currentThreadId() returns -1 if called from a thread + // not in the thread pool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < device.numThreads()); + + const Index offset = aligned_blocksize * (thread_idx + 1); + return reinterpret_cast(static_cast(buffer) + offset); + } + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + void* buffer; // temporary buffer for blocks + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext( + const ThreadPoolDevice& device, const Evaluator& evaluator) { + // Prefer blocks skewed toward inner dimension. 
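  // Editor's note: this helper merges the block shape/size hints reported by the
  // expression tree via getResourceRequirements(), refines the block size from the
  // cost model so that one block is roughly one unit-cost task, and allocates one
  // aligned scratch block per pool thread plus an extra slot for the calling thread
  // (currentThreadId() == -1 maps to slot 0 in GetCurrentThreadBuffer above).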
+ TensorBlockShapeType block_shape = kSkewedInnerDims; + Index block_total_size = 0; + + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. + TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel::taskSize(1, cost); + size_t block_size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper( + typename TensorBlockMapper::Dimensions(evaluator.dimensions()), + block_shape, block_size); + + block_size = block_mapper.block_dims_total_size(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * + divup(block_size * sizeof(typename Evaluator::Scalar), align); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + + return {block_mapper, cost * block_size, buf, aligned_blocksize}; +} + template struct EvalRange { static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, @@ -274,7 +355,7 @@ class TensorExecutor { typedef EvalRange EvalRange; Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), @@ -290,18 +371,18 @@ class TensorExecutor { template class TensorExecutor { public: + typedef typename traits::Index StorageIndex; typedef typename traits::Scalar Scalar; typedef typename remove_const::type ScalarNoConst; - typedef TensorEvaluator Evaluator; - typedef typename traits::Index StorageIndex; - static const int NumDims = traits::NumDimensions; + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - typedef TensorBlockMapper TensorBlockMapper; - Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); @@ -315,50 +396,152 @@ class TensorExecutor resources; - evaluator.getResourceRequirements(&resources); - MergeResourceRequirements(resources, &block_shape, &block_total_size); - int num_threads = device.numThreads(); + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(device, evaluator); - // Estimate minimum block size based on cost. 
- TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); - double taskSize = TensorCostModel::taskSize(1, cost); - size_t block_size = static_cast(1.0 / taskSize); - TensorBlockMapper block_mapper( - typename TensorBlockMapper::Dimensions(evaluator.dimensions()), - block_shape, block_size); - block_size = block_mapper.block_dims_total_size(); - const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - const size_t aligned_blocksize = - align * divup(block_size * sizeof(Scalar), align); - void* buf = device.allocate((num_threads + 1) * aligned_blocksize); device.parallelFor( - block_mapper.total_block_count(), cost * block_size, - [=, &device, &evaluator, &block_mapper](StorageIndex firstIdx, - StorageIndex lastIdx) { - // currentThreadId() returns -1 if called from a thread not in the - // thread pool, such as the main thread dispatching Eigen - // expressions. - const int thread_idx = device.currentThreadId(); - eigen_assert(thread_idx >= -1 && thread_idx < num_threads); - ScalarNoConst* thread_buf = reinterpret_cast( - static_cast(buf) + aligned_blocksize * (thread_idx + 1)); + tiling.block_mapper.total_block_count(), tiling.cost, + [=, &device, &evaluator, &tiling](StorageIndex firstIdx, + StorageIndex lastIdx) { + ScalarNoConst* thread_buf = + tiling.template GetCurrentThreadBuffer(device); for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - auto block = block_mapper.GetBlockForIndex(i, thread_buf); + auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf); evaluator.evalBlock(&block); } }); - device.deallocate(buf); + device.deallocate(tiling.buffer); } evaluator.cleanup(); } }; +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + std::function done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + // TODO(ezhulenev): This is a potentially blocking operation. Make it async! 
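  // Editor's note: the heap-allocated context owns the evaluator for the lifetime of
  // the asynchronous evaluation; the final `[ctx]() { delete ctx; }` callback handed
  // to parallelForAsync destroys it, and the destructor then invokes the user's
  // `done` callback followed by evaluator.cleanup().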
+ const bool needs_assign = ctx->evaluator.evalSubExprsIfNeeded(nullptr); + + typedef EvalRange EvalRange; + + if (needs_assign) { + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); + }, + [ctx]() { delete ctx; }); + } + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + std::function done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + on_done(); + evaluator.cleanup(); + } + + Evaluator evaluator; + + private: + std::function on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index StorageIndex; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + std::function done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + Index total_size = array_prod(ctx->evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size && + !ExpressionHasTensorBroadcastingOp::value) { + internal::TensorAsyncExecutor::runAsync( + expr, device, [ctx]() { delete ctx; }); + return; + } + + // TODO(ezhulenev): This is a potentially blocking operation. Make it async! 
+ const bool needs_assign = ctx->evaluator.evalSubExprsIfNeeded(nullptr); + + if (needs_assign) { + ctx->tiling = + internal::GetTensorExecutorTilingContext(device, ctx->evaluator); + + device.parallelForAsync( + ctx->tiling.block_mapper.total_block_count(), ctx->tiling.cost, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + ScalarNoConst* thread_buf = + ctx->tiling.template GetCurrentThreadBuffer(ctx->device); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); + ctx->evaluator.evalBlock(&block); + } + }, + [ctx]() { delete ctx; }); + } + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + std::function done) + : device(thread_pool), + evaluator(expr, thread_pool), + on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + on_done(); + device.deallocate(tiling.buffer); + evaluator.cleanup(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + std::function on_done; + }; +}; + #endif // EIGEN_USE_THREADS @@ -419,7 +602,7 @@ template EIGEN_STRONG_INLINE void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { const int block_size = device.maxGpuThreadsPerBlock(); @@ -517,10 +700,10 @@ struct ExecExprFunctorKernel template class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index Index; static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { Eigen::TensorEvaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { Index range, GRange, tileSize; Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 3cca0c7e9..e823bd932 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -94,6 +94,7 @@ template class MakePointer_ = MakePointer> cl template class TensorForcedEvalOp; template class TensorDevice; +template class TensorAsyncDevice; template struct TensorEvaluator; struct NoOpOutputKernel; @@ -167,6 +168,11 @@ template ::value> class TensorExecutor; +template ::value, + bool Tileable = IsTileable::value> +class TensorAsyncExecutor; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index e9922a48d..f4d0401da 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -562,37 +562,112 @@ static void test_execute_reverse_rvalue(Device d) } } +template +static void test_async_execute_unary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
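  // Editor's note: expressions whose total size falls below the
  // firstLevelCacheSize()-derived threshold in the ThreadPool executors are routed
  // to the non-tiled path, so the dimensions here must be large enough for the test
  // to actually exercise the tiled (async) executor.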
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = internal::TensorAsyncExecutor; + Eigen::Barrier done(1); + Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); }); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + +template +static void test_async_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor lhs(dims); + Tensor rhs(dims); + Tensor dst(dims); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp; + using Executor = internal::TensorAsyncExecutor; + + Eigen::Barrier done(1); + Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); }); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); + } +} + #ifdef EIGEN_DONT_VECTORIZE #define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL -#else +#else #define VECTORIZABLE(VAL) VAL #endif #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART -#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ +#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))) +// NOTE: Currently only ThreadPoolDevice supports async expression evaluation. +#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))) + EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; + // Default device is unused in ASYNC tests. 
+ EIGEN_UNUSED_VARIABLE(default_device); - const auto num_threads = internal::random(1, 24); + const auto num_threads = internal::random(20, 24); Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); @@ -660,8 +735,16 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5); + + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5); + // Force CMake to split this test. - // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14 + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 } #undef CALL_SUBTEST_COMBINATIONS diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index f8a7b3662..53b50d1ed 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -38,9 +38,9 @@ class TestAllocator : public Allocator { void test_multithread_elementwise() { - Tensor in1(2,3,7); - Tensor in2(2,3,7); - Tensor out(2,3,7); + Tensor in1(200, 30, 70); + Tensor in2(200, 30, 70); + Tensor out(200, 30, 70); in1.setRandom(); in2.setRandom(); @@ -49,15 +49,39 @@ void test_multithread_elementwise() Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(3, 11)); out.device(thread_pool_device) = in1 + in2 * 3.14f; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f); } } } } +void test_async_multithread_elementwise() +{ + Tensor in1(200, 30, 70); + Tensor in2(200, 30, 70); + Tensor out(200, 30, 70); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPool tp(internal::random(3, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(3, 11)); + + Eigen::Barrier b(1); + out.device(thread_pool_device, [&b]() { b.Notify(); }) = in1 + in2 * 3.14f; + b.Wait(); + + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f); + } + } + } +} void test_multithread_compound_assignment() { @@ -516,6 +540,7 @@ void test_threadpool_allocate(TestAllocator* allocator) EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool) { CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_async_multithread_elementwise()); CALL_SUBTEST_1(test_multithread_compound_assignment()); CALL_SUBTEST_2(test_multithread_contraction()); From 619cea94916e7531a839ee0ff657714857921db8 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 14:51:17 -0700 Subject: [PATCH 17/30] Revert accidentally removed header from ThreadPool --- unsupported/Eigen/CXX11/ThreadPool | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 613fdb57a..7a795da3d 100644 --- 
a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" From f0b36fb9a405400e82b73ea70097b8ae3cd1095a Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 15:13:38 -0700 Subject: [PATCH 18/30] evalSubExprsIfNeededAsync + async TensorContractionThreadPool --- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 12 + .../CXX11/src/Tensor/TensorBroadcasting.h | 8 + .../CXX11/src/Tensor/TensorContraction.h | 139 ++-- .../src/Tensor/TensorContractionThreadPool.h | 711 ++++++++++++------ .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 35 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 51 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 36 +- .../Eigen/CXX11/src/ThreadPool/Barrier.h | 3 + unsupported/test/cxx11_tensor_thread_pool.cpp | 140 ++++ 9 files changed, 833 insertions(+), 302 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index d6e51bc6c..270ad974e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -147,6 +147,18 @@ struct TensorEvaluator, Device> // by the rhs to the lhs. return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync( + m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); + }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 10bdbc6a0..b290de311 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -214,6 +214,14 @@ struct TensorEvaluator, Device> return true; } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index a398b2b3f..2f8656fbb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -373,13 +373,13 @@ struct TensorContractionEvaluatorBase typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; // Most of the code is assuming that both input tensors are ColMajor. 
If the @@ -390,7 +390,7 @@ struct TensorContractionEvaluatorBase static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - + typedef TensorEvaluator LeftEvaluatorType; typedef TensorEvaluator RightEvaluatorType; @@ -605,48 +605,99 @@ struct TensorContractionEvaluatorBase } } -#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - } \ - else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + if (dest) { + evalToAsync(dest, [done]() { done(false); }); + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalToAsync(m_result, [done]() { done(true); }); + } + }); + }); + } +#endif // EIGEN_USE_THREADS + +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } + +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { static_cast(this)->template evalProduct(buffer); } +#ifdef EIGEN_USE_THREADS + template + void evalToAsync(Scalar* buffer, EvalToCallback done) const { + static_cast(this) + ->template evalProductAsync(buffer, + std::move(done)); + } +#endif // EIGEN_USE_THREADS + template void evalProductSequential(Scalar* buffer) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index ca20038a4..f9d9d6d31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -73,6 +73,34 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, requried for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. + static const bool IsEvalInSyncMode = + std::is_same::value; + const Index m = this->m_i_size; const Index n = this->m_j_size; const Index k = this->m_k_size; @@ -134,8 +162,16 @@ struct TensorEvaluatortemplate evalShardedByInnerDim(num_threads_by_k, - buffer); + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = new EvalShardedByInnerDimContext( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + return; } @@ -146,6 +182,7 @@ struct TensorEvaluatortemplate evalProductSequential, Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); return; } @@ -230,21 +267,89 @@ struct TensorEvaluator - class Context { + // ------------------------------------------------------------------------ // + + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { + eigen_assert(false && "NoCallback should never be called"); + } + }; + + // ------------------------------------------------------------------------ // + + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) + : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. 
+ done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. + + template + class EvalParallelContext { public: typedef internal::TensorContractionInputMapper< LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, @@ -267,11 +372,15 @@ struct TensorEvaluatorm_device), + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, + Index tm, Index tn, Index tk, Index bm, Index bn, + Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack, + bool parallelize_by_sharding_dim_only, + DoneCallback done) + : done_(this, std::move(done)), + device_(self->m_device), lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, self->m_k_strides), @@ -299,8 +408,7 @@ struct TensorEvaluator done_; + const Device& device_; LhsMapper lhs_; RhsMapper rhs_; @@ -780,10 +900,344 @@ struct TensorEvaluator + using SyncEvalParallelContext = + EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. + + template + struct EvalShardedByInnerDimContext { + EvalShardedByInnerDimContext(const Self* evaluator, int num_threads, + Scalar* result, Index m, Index n, Index k, + DoneCallback done) + : evaluator(evaluator), + m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), + m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), + m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), + num_threads(num_threads), + result(result), + m(m), + n(n), + k(k), + done(std::move(done)), + buffer_size_bytes(m * n * sizeof(Scalar)), + block_size(blockSize(k, num_threads)), + num_blocks(divup(k, block_size)), + num_pending_blocks(internal::convert_index(num_blocks)), + l0_ranges(divup(num_blocks, l0_size)), + l0_state(l0_ranges), + block_buffers(num_blocks) { + // Keep count of pending gemm tasks for each l0 range. + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); + } + + // Allocate temporary buffers for each block. + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast(evaluator->m_device.allocate( + buffer_size_bytes)); + block_buffers.emplace_back(buf); + } + } + + ~EvalShardedByInnerDimContext() { + for (Index i = 1; i < num_blocks; ++i) { + evaluator->m_device.deallocate(block_buffers[i]); + } + } + + template + void run() { + Barrier barrier(internal::convert_index(num_blocks)); + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + evaluator->m_device.enqueueNoNotification( + [this, block_idx, &barrier]() { + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + }); + } + barrier.Wait(); + + // Aggregate partial sums from l0 ranges. 
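      // Editor's note, worked example: with num_blocks = 10 and l0_size = 4 there are
      // l0_ranges = 3 ranges covering blocks {0..3}, {4..7} and {8,9}. The last task
      // to finish within each range has already folded that range's partial results
      // into the range's first block, so this final pass only needs to add
      // block_buffers[4] and block_buffers[8] into block_buffers[0].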
+ aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + } + + template + void runAsync() { + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + evaluator->m_device.enqueueNoNotification([this, block_idx]() { + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. + done_copy(); + } + }); + } + } + + private: + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. + static const Index packet_size = internal::packet_traits::size; + + const Self* evaluator; // TensorContraction evaluator + + // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + int num_threads; + Scalar* result; + + Index m; + Index n; + Index k; + + DoneCallback done; + + // ----------------------------------------------------------------------// + // Algorithm parameters. + + // We will compute partial results into the buffers of this size. + Index buffer_size_bytes; + + Index block_size; + Index num_blocks; + + // Keep track of pending tasks when evaluate in async mode. + std::atomic num_pending_blocks; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks done, we need to add only these pre-aggregated blocks. + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. + MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + ::memset(buf, 0, buffer_size_bytes); + + TENSOR_CONTRACTION_DISPATCH( + evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. 
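      // Editor's note: fetch_sub returns the counter value *before* the decrement, so
      // v == 1 below means this task retired the last pending block of its l0 range
      // and is therefore the one responsible for folding the range's partial sums
      // into the range's first block.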
+ const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. + template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel( + OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, + Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? 
range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, + Scalar* tgt_buf) { + const int output_packet_size = + internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = + internal::pload(src_buf + i); + const PacketReturnType tgt_val = + internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, + sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, + const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = + padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs + // and gemm_pack_rhs, so we specify minimum desired block size. + static Index blockSize(Index k, int num_threads) { + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + return numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + } + + EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; + void operator=(const EvalShardedByInnerDimContext&) = delete; + }; + + // ------------------------------------------------------------------------ // + + // Below are the function used by evalProductImpl heuristics, trying to select + // optimcal parameters for parallelization algorithm. + // Decide whether we want to shard m x n contraction by columns or by rows. 
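  // As a concrete illustration of the blockSize() helper in
  // EvalShardedByInnerDimContext above (all numbers assumed): for float with
  // packet_size = 4, k = 1000 and num_threads = 8, divup(1000, 8) = 125 is
  // rounded up to 128 (a multiple of 8), the desired minimum is 12 * 4 = 48,
  // so block_size = mini(1000, maxi(48, 128)) = 128 and the contraction
  // dimension is split into divup(1000, 128) = 8 blocks.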
static bool shardByCol(Index m, Index n, Index num_threads) { // Note: we are comparing both n and m against Traits::nr, it is not @@ -916,55 +1370,6 @@ struct TensorEvaluator - EIGEN_STRONG_INLINE void addToBuffer(size_t n, const Scalar* src_buf, - Scalar* tgt_buf) const { - const int output_packet_size = internal::unpacket_traits::size; - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const PacketReturnType src_val = - internal::pload(src_buf + i); - const PacketReturnType tgt_val = - internal::ploadt(tgt_buf + i); - const PacketReturnType sum = internal::padd(src_val, tgt_val); - internal::pstoret(tgt_buf + i, sum); - } - for (; i < n; ++i) { - tgt_buf[i] += src_buf[i]; - } - } - - template - EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0, - const Scalar* src_buf1, - const Scalar* src_buf2, - Scalar* dst_buf) const { - using ::Eigen::internal::padd; - using ::Eigen::internal::pload; - using ::Eigen::internal::ploadt; - using ::Eigen::internal::pstoret; - - const int output_packet_size = - internal::unpacket_traits::size; - - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const auto src_val0 = pload(src_buf0 + i); - const auto src_val1 = pload(src_buf1 + i); - const auto src_val2 = pload(src_buf2 + i); - - const auto dst_val = ploadt(dst_buf + i); - const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); - - pstoret(dst_buf + i, sum); - } - for (; i < n; ++i) { - dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; - } - } - // Decide whether we want to shard m x k x n contraction over the inner // (contraction) dimension (k). static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, @@ -992,163 +1397,6 @@ struct TensorEvaluator - void evalShardedByInnerDim(int num_threads, Scalar* result) const { - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - - // We will compute partial results into the buffers of this size. - const Index buffer_size_bytes = m * n * sizeof(Scalar); - - // The underlying GEMM kernel assumes that k is a multiple of - // the packet size and subtle breakage occurs if this is violated. - const Index packet_size = internal::packet_traits::size; - - const auto round_up = [=](Index index) -> Index { - const Index kmultiple = packet_size <= 8 ? 8 : packet_size; - return divup(index, kmultiple) * kmultiple; - }; - - // Cost model doesn't capture well the cost associated with constructing - // tensor contraction mappers and computing loop bounds in gemm_pack_lhs and - // gemm_pack_rhs, so we specify minimum desired block size. - const Index target_block_size = round_up(divup(k, num_threads)); - const Index desired_min_block_size = 12 * packet_size; - - const Index block_size = numext::mini( - k, numext::maxi(desired_min_block_size, target_block_size)); - const Index num_blocks = divup(k, block_size); - - // Compute block size with accounting for potentially incomplete last block. - const auto actual_block_size = [=](Index block_idx) -> Index { - return block_idx + 1 < num_blocks - ? block_size - : k + block_size - block_size * num_blocks; - }; - - // We compute partial gemm results in parallel, and to get the final result - // we need to add them all together. For the large number of threads (>= 48) - // this adds a very expensive sequential step at the end. 
- // - // We split the [0, num_blocks) into small ranges, and when a task for the - // block finishes its partial gemm computation, it checks if it was the last - // gemm in the range, and if so, it will add all blocks of the range. - // - // After all tasks finihes, we need to add only these pre-aggregated blocks. - - // Compute range size with accounting for potentially incomplete last range. - const auto actual_range_size = [=](Index num_ranges, Index range_size, - Index range_idx) -> Index { - eigen_assert(range_idx < num_ranges); - return range_idx + 1 < num_ranges - ? range_size - : num_blocks + range_size - range_size * num_ranges; - }; - - // For now we use just a single level of ranges to compute pre-aggregated - // partial sums, but in general we can use more layers to compute tree - // aggregation in parallel and reduce the size of the sequential step. - // - // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make - // sense only if number of threads >= ~128? - static const Index l0_size = 4; - const Index l0_ranges = divup(num_blocks, l0_size); - - // Keep count of pending gemm tasks for each l0 range. - MaxSizeVector> l0_state(l0_ranges); - for (int i = 0; i < l0_ranges; ++i) { - const Index num_pending_tasks = actual_range_size(l0_ranges, l0_size, i); - l0_state.emplace_back(internal::convert_index(num_pending_tasks)); - } - - MaxSizeVector block_buffers(num_blocks); - - auto process_block = [&, this](Index block_idx, Index begin, Index end) { - Scalar* buf = block_buffers[block_idx]; - ::memset(buf, 0, buffer_size_bytes); - - TENSOR_CONTRACTION_DISPATCH( - this->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, - /*num_threads=*/internal::convert_index(num_blocks))); - - // Check if it was the last task in l0 range. - const Index l0_index = block_idx / l0_size; - const int v = l0_state[l0_index].fetch_sub(1); - eigen_assert(v >= 1); - - // If we processed the last block of the range, we can aggregate all - // partial results into the first block of the range. - if (v == 1) { - const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index); - const Index dst_block_idx = l0_index * l0_size; - - if (rng_size == l0_size) { - addAllToBuffer( - m * n, - /*src_buf0=*/block_buffers[dst_block_idx + 1], - /*src_buf1=*/block_buffers[dst_block_idx + 2], - /*src_buf2=*/block_buffers[dst_block_idx + 3], - /*dst_buf= */ block_buffers[dst_block_idx]); - } else { - // Aggregate blocks of potentially incomplete last range. - for (int i = 1; i < rng_size; ++i) { - addToBuffer(m * n, - /*src_buf=*/block_buffers[dst_block_idx + i], - /*dst_buf=*/block_buffers[dst_block_idx]); - } - } - } - }; - - Barrier barrier(internal::convert_index(num_blocks)); - for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { - Scalar* buf = block_idx == 0 - ? result - : static_cast( - this->m_device.allocate(buffer_size_bytes)); - block_buffers.push_back(buf); - - Index block_start = block_idx * block_size; - Index block_end = block_start + actual_block_size(block_idx); - - this->m_device.enqueueNoNotification([=, &barrier, &process_block]() { - process_block(block_idx, block_start, block_end); - barrier.Notify(); - }); - } - barrier.Wait(); - - // Aggregate partial sums from l0 ranges. 
- Index l0_index = 1; - for (; l0_index + 2 < l0_ranges; l0_index += 3) { - addAllToBuffer( - m * n, - /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], - /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], - /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], - /*dst_buf= */block_buffers[0]); - } - for (; l0_index < l0_ranges; ++l0_index) { - addToBuffer(m * n, block_buffers[l0_index * l0_size], - block_buffers[0]); - } - - // Don't forget to deallocate ALL temporary buffers. - for (Index i = 1; i < num_blocks; ++i) { - this->m_device.deallocate(block_buffers[i]); - } - - // Finally call output kernel with finalized output buffer. - typedef internal::blas_data_mapper OutputMapper; - this->m_output_kernel(OutputMapper(result, m), - this->m_tensor_contraction_params, - static_cast(0), - static_cast(0), - m, n); - } - TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { // Compute cost. const int output_packet_size = internal::unpacket_traits::size; @@ -1188,7 +1436,6 @@ struct TensorEvaluator f) const { - parallelFor(n, cost, NULL, std::move(f)); + parallelFor(n, cost, nullptr, std::move(f)); } // WARNING: This function is asynchronous and will not block the calling thread. @@ -248,6 +248,14 @@ struct ThreadPoolDevice { std::function block_align, std::function f, std::function done) const { + // Compute small problems directly in the caller thread. + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + done(); + return; + } + // Compute block size and total count of blocks. ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); @@ -269,24 +277,26 @@ struct ThreadPoolDevice { // Single block or less, execute directly. ctx->f(firstIdx, lastIdx); - // Call 'done' callback if it was the last block. - if (ctx->count.fetch_sub(1) == 1) { - (ctx->done)(); - // We can't delete ctx right now, because it will deallocate the closure - // we are currently in. - pool_->Schedule([ctx]() { delete ctx; }); - } + // Delete async context if it was the last block. + if (ctx->count.fetch_sub(1) == 1) delete ctx; }; - // Execute the root in the thread pool. - pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + ctx->handle_range(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } } // Convenience wrapper for parallelForAsync that does not align blocks. void parallelForAsync(Index n, const TensorOpCost& cost, std::function f, std::function done) const { - parallelForAsync(n, cost, NULL, std::move(f), std::move(done)); + parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); } // Thread pool accessor. 
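As a usage sketch of the asynchronous API above (the pool size, cost constants
and the src/dst buffers below are invented for illustration; only the
parallelForAsync signature comes from this file), a caller provides a range
functor plus a 'done' callback and must keep the data alive until that
callback has run:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      std::vector<float> src(1024, 1.0f), dst(1024, 0.0f);

      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

      Eigen::Barrier barrier(1);
      device.parallelForAsync(
          /*n=*/static_cast<Eigen::Index>(dst.size()),
          Eigen::TensorOpCost(/*bytes_loaded=*/sizeof(float),
                              /*bytes_stored=*/sizeof(float),
                              /*compute_cycles=*/1),
          [&](Eigen::Index first, Eigen::Index last) {
            // Per-block body: runs on the pool (or inline for tiny problems).
            for (Eigen::Index i = first; i < last; ++i) dst[i] = 2.0f * src[i];
          },
          [&barrier]() { barrier.Notify(); });  // fires after the last block

      barrier.Wait();  // dst is fully written only after the done callback
      return 0;
    }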
@@ -307,6 +317,7 @@ struct ThreadPoolDevice { : count(block_count), f(std::move(block_f)), done(std::move(done_callback)) {} + ~ParallelForAsyncContext() { done(); } std::atomic count; std::function f; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index a3a79d4e9..fec735868 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -79,7 +79,16 @@ struct TensorEvaluator return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); @@ -247,6 +256,15 @@ struct TensorEvaluator return true; } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -346,6 +364,15 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -425,6 +452,15 @@ struct TensorEvaluator, Device> m_argImpl.evalSubExprsIfNeeded(NULL); return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -546,6 +582,19 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? 
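+    // Note on ordering: the nested callbacks below serialize the two
+    // evaluations. The left sub-expression is evaluated first, its completion
+    // callback launches the right one, and the innermost callback finally
+    // reports completion through done(true).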
+ m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, + [done](bool) { done(true); }); + }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 18d9de9e6..ce2337b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -430,12 +430,14 @@ class TensorAsyncExecutor std::function done) { TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); - // TODO(ezhulenev): This is a potentially blocking operation. Make it async! - const bool needs_assign = ctx->evaluator.evalSubExprsIfNeeded(nullptr); - typedef EvalRange EvalRange; + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } - if (needs_assign) { + typedef EvalRange EvalRange; const StorageIndex size = array_prod(ctx->evaluator.dimensions()); device.parallelForAsync( size, ctx->evaluator.costPerCoeff(Vectorizable), @@ -444,7 +446,9 @@ class TensorAsyncExecutor EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); }, [ctx]() { delete ctx; }); - } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); } private: @@ -496,26 +500,32 @@ class TensorAsyncExecutorevaluator.evalSubExprsIfNeeded(nullptr); + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } - if (needs_assign) { ctx->tiling = - internal::GetTensorExecutorTilingContext(device, ctx->evaluator); + GetTensorExecutorTilingContext(device, ctx->evaluator); device.parallelForAsync( ctx->tiling.block_mapper.total_block_count(), ctx->tiling.cost, [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { ScalarNoConst* thread_buf = - ctx->tiling.template GetCurrentThreadBuffer(ctx->device); + ctx->tiling.template GetCurrentThreadBuffer( + ctx->device); for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); + auto block = + ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); ctx->evaluator.evalBlock(&block); } }, [ctx]() { delete ctx; }); - } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); } private: diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h index bae68e1fb..e4c59dc3d 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h @@ -25,6 +25,9 @@ class Barrier { void Notify() { unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; if (v != 1) { + // Clear the lowest bit (waiter flag) and check that the original state + // value was not zero. If it was zero, it means that notify was called + // more times than the original count. 
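+      // For example, a Barrier constructed with count == 2 stores the count
+      // shifted left by one, so state_ starts at 4. Two Notify() calls see
+      // v == 2 and then v == 0, and both pass this assert; a third, erroneous
+      // Notify() wraps v around so that ((v + 2) & ~1) == 0 and the assert
+      // fires.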
eigen_plain_assert(((v + 2) & ~1) != 0); return; // either count has not dropped to 0, or waiter is not waiting } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 53b50d1ed..62973cd08 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -330,6 +330,52 @@ static void test_multithread_contraction_with_output_kernel() { } } +template +void test_async_multithread_contraction_agrees_with_singlethread() +{ + int contract_size = internal::random(100, 500); + + Tensor left(internal::random(10, 40), + contract_size, + internal::random(10, 40)); + + Tensor right( + internal::random(1, 20), internal::random(1, 20), contract_size, + internal::random(1, 20)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 2)}}); + + Eigen::ThreadPool tp(internal::random(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(8, 32)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result(st_result.dimensions()); + + Eigen::Barrier barrier(1); + tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) = + left.contract(right, dims); + barrier.Wait(); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test + // will fail due to numerical precision issues when values are small) + if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + // We are triggering 'evalShardedByInnerDim' optimization. template static void test_sharded_by_inner_dim_contraction() @@ -410,6 +456,93 @@ static void test_sharded_by_inner_dim_contraction_with_output_kernel() } } +// We are triggering 'evalShardedByInnerDim' optimization. +template +static void test_async_sharded_by_inner_dim_contraction() +{ + typedef Tensor::DimensionPair DimPair; + + const int num_threads = internal::random(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor t_left(2, 10000); + Tensor t_right(10000, 10); + Tensor t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(1, 0)}}); + + // compute results by separate methods + Eigen::Barrier barrier(1); + t_result.device(device, [&barrier]() { barrier.Notify(); }) = + t_left.contract(t_right, dims); + barrier.Wait(); + + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + +// We are triggering 'evalShardedByInnerDim' optimization with output kernel. 
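// For reference, an output kernel is just a functor that the contraction
// evaluator calls on every finalized block of the result (see the call in
// applyOutputKernel() above). A minimal hypothetical kernel matching that
// calling convention (the ScaleOutputKernel name and the 0.5 factor are
// invented; the SqrtOutputKernel used in these tests is defined earlier in
// this file and follows the same shape):
struct ScaleOutputKernel {
  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>&
          output_mapper,
      const Eigen::TensorContractionParams& /*params*/, Index /*i*/,
      Index /*j*/, Index num_rows, Index num_cols) const {
    for (Index c = 0; c < num_cols; ++c)
      for (Index r = 0; r < num_rows; ++r)
        output_mapper(r, c) *= Scalar(0.5);  // post-process each entry in place
  }
};
// It would be plugged in exactly like SqrtOutputKernel below:
//   left.contract(right, dims, ScaleOutputKernel());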
+template +static void test_async_sharded_by_inner_dim_contraction_with_output_kernel() +{ + typedef Tensor::DimensionPair DimPair; + + const int num_threads = internal::random(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor t_left(2, 10000); + Tensor t_right(10000, 10); + Tensor t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(1, 0)}}); + + // compute results by separate methods + Eigen::Barrier barrier(1); + t_result.device(device, [&barrier]() { barrier.Notify(); }) = + t_left.contract(t_right, dims, SqrtOutputKernel()); + barrier.Wait(); + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + template void test_full_contraction() { int contract_size1 = internal::random(1, 500); @@ -550,11 +683,18 @@ EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool) CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel()); CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel()); + CALL_SUBTEST_3(test_async_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_async_multithread_contraction_agrees_with_singlethread()); + // Test EvalShardedByInnerDimContext parallelization strategy. CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction()); CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction()); CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel()); CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction_with_output_kernel()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction_with_output_kernel()); // Exercise various cases that have been problematic in the past. 
CALL_SUBTEST_5(test_contraction_corner_cases()); From edf2ec28d864f1cc1c7d93e34e13333571f91565 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 15:29:25 -0700 Subject: [PATCH 19/30] Fix block mapper type name in TensorExecutor --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index ce2337b63..10339e5e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -507,7 +507,7 @@ class TensorAsyncExecutortiling = - GetTensorExecutorTilingContext(device, ctx->evaluator); device.parallelForAsync( From 79c402e40e80b670eb6bbaae631d0a5694d720b8 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 15:38:31 -0700 Subject: [PATCH 20/30] Fix shadow warnings in TensorContractionThreadPool --- .../src/Tensor/TensorContractionThreadPool.h | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index f9d9d6d31..4adfeb560 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -920,19 +920,19 @@ struct TensorEvaluator struct EvalShardedByInnerDimContext { - EvalShardedByInnerDimContext(const Self* evaluator, int num_threads, - Scalar* result, Index m, Index n, Index k, - DoneCallback done) - : evaluator(evaluator), + EvalShardedByInnerDimContext(const Self* self, int num_threads, + Scalar* result_buffer, + Index m_size, Index n_size, Index k_size, + DoneCallback done_callback) + : evaluator(self), m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), - num_threads(num_threads), - result(result), - m(m), - n(n), - k(k), - done(std::move(done)), + result(result_buffer), + m(m_size), + n(n_size), + k(k_size), + done(std::move(done_callback)), buffer_size_bytes(m * n * sizeof(Scalar)), block_size(blockSize(k, num_threads)), num_blocks(divup(k, block_size)), @@ -1032,7 +1032,6 @@ struct TensorEvaluator Date: Sat, 8 Jun 2019 21:09:06 +0000 Subject: [PATCH 21/30] Updated Eigen_Colamd.h, namespacing macros ALIVE & DEAD as COLAMD_ALIVE & COLAMD_DEAD to prevent conflicts with other libraries / code. --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 67fcad3f7..2a4338393 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -105,8 +105,8 @@ namespace internal { #define COLAMD_EMPTY (-1) /* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) +#define COLAMD_ALIVE (0) +#define COLAMD_DEAD (-1) /* Column status */ #define DEAD_PRINCIPAL (-1) @@ -114,12 +114,12 @@ namespace internal { /* Macros for row and column status update and checking. 
*/ #define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < COLAMD_ALIVE) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= COLAMD_ALIVE) +#define COL_IS_DEAD(c) (Col [c].start < COLAMD_ALIVE) +#define COL_IS_ALIVE(c) (Col [c].start >= COLAMD_ALIVE) #define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } +#define KILL_ROW(r) { Row [r].shared2.mark = COLAMD_DEAD ; } #define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } #define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } From 0a6b553ecf0716c735e19e829b5d1fb177ef36d6 Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Sun, 21 Jul 2019 04:53:31 +0000 Subject: [PATCH 22/30] Eigen_Colamd.h edited online with Bitbucket replacing constant #defines with const definitions --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 363 +++++++++++------------ 1 file changed, 181 insertions(+), 182 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 2a4338393..bba7c67ac 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -13,37 +13,37 @@ // Davis (davis@cise.ufl.edu), University of Florida. The algorithm was // developed in collaboration with John Gilbert, Xerox PARC, and Esmond // Ng, Oak Ridge National Laboratory. -// +// // Date: -// +// // September 8, 2003. Version 2.3. -// +// // Acknowledgements: -// +// // This work was supported by the National Science Foundation, under // grants DMS-9504974 and DMS-9803599. -// +// // Notice: -// +// // Copyright (c) 1998-2003 by the University of Florida. // All Rights Reserved. -// +// // THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY // EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. -// +// // Permission is hereby granted to use, copy, modify, and/or distribute // this program, provided that the Copyright, this License, and the // Availability of the original version is retained on all copies and made // accessible to the end-user of any code or package that includes COLAMD -// or any modified version of COLAMD. -// +// or any modified version of COLAMD. +// // Availability: -// +// // The colamd/symamd library is available at -// +// // http://www.suitesparse.com - + #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H @@ -57,42 +57,42 @@ namespace internal { /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 +const size_t ColamdKnobs = 20; /* number of output statistics. Only stats [0..6] are currently used. */ -#define COLAMD_STATS 20 +const size_t ColamdStats = 20; /* knobs [0] and stats [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 +const size_t ColamdDenseRow = 0; /* knobs [1] and stats [1]: dense column knob and output statistic. 
*/ -#define COLAMD_DENSE_COL 1 +const size_t ColamdDenseCol = 1; /* stats [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 +const size_t ColamdDefragCount = 2; /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -#define COLAMD_STATUS 3 +const size_t ColamdStatus = 3; -/* stats [4..6]: error info, or info on jumbled columns */ -#define COLAMD_INFO1 4 -#define COLAMD_INFO2 5 -#define COLAMD_INFO3 6 +/* stats [4..6]: error info, or info on jumbled columns */ +const size_t ColamdInfo1 = 4; +const size_t ColamdInfo2 = 5; +const size_t ColamdInfo3 = 6; /* error codes returned in stats [3]: */ -#define COLAMD_OK (0) -#define COLAMD_OK_BUT_JUMBLED (1) -#define COLAMD_ERROR_A_not_present (-1) -#define COLAMD_ERROR_p_not_present (-2) -#define COLAMD_ERROR_nrow_negative (-3) -#define COLAMD_ERROR_ncol_negative (-4) -#define COLAMD_ERROR_nnz_negative (-5) -#define COLAMD_ERROR_p0_nonzero (-6) -#define COLAMD_ERROR_A_too_small (-7) -#define COLAMD_ERROR_col_length_negative (-8) -#define COLAMD_ERROR_row_index_out_of_bounds (-9) -#define COLAMD_ERROR_out_of_memory (-10) -#define COLAMD_ERROR_internal_error (-999) +const int ColamdOk = 0; +const int ColamdOkButJumbled = 1; +const int ColamdErrorANotPresent = -1; +const int ColamdErrorPNotPresent = -2; +const int ColamdErrorNrowNegative = -3; +const int ColamdErrorNcolNegative = -4; +const int ColamdErrorNnzNegative = -5; +const int ColamdErrorP0Nonzero = -6; +const int ColamdErrorATooSmall = -7; +const int ColamdErrorColLengthNegative = -8; +const int ColamdErrorRowIndexOutOfBounds = -9; +const int ColamdErrorOutOfMemory = -10; +const int ColamdErrorInternalError = -999; /* ========================================================================== */ /* === Definitions ========================================================== */ @@ -101,27 +101,26 @@ namespace internal { #define ONES_COMPLEMENT(r) (-(r)-1) /* -------------------------------------------------------------------------- */ - -#define COLAMD_EMPTY (-1) +const int ColamdEmpty = -1; /* Row and column status */ -#define COLAMD_ALIVE (0) -#define COLAMD_DEAD (-1) +const int ColamdAlive = 0; +const int ColamdDead = -1; /* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) +const int ColamdDeadPrincipal = -1; +const int ColamdDeadNonPrincipal = -2; /* Macros for row and column status update and checking. 
*/ #define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < COLAMD_ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= COLAMD_ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < COLAMD_ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= COLAMD_ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = COLAMD_DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ColamdAlive) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ColamdAlive) +#define COL_IS_DEAD(c) (Col [c].start < ColamdAlive) +#define COL_IS_ALIVE(c) (Col [c].start >= ColamdAlive) +#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == ColamdDeadPrincipal) +#define KILL_ROW(r) { Row [r].shared2.mark = ColamdDead ; } +#define KILL_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadPrincipal ; } +#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadNonPrincipal ; } /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -131,7 +130,7 @@ namespace internal { template struct colamd_col { - IndexType start ; /* index for A of first row in this column, or DEAD */ + IndexType start ; /* index for A of first row in this column, or ColamdDead */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -159,9 +158,9 @@ struct colamd_col IndexType degree_next ; /* next column, if col is in a degree list */ IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; - + }; - + template struct Colamd_Row { @@ -177,13 +176,13 @@ struct Colamd_Row IndexType mark ; /* for computing set differences and marking dead rows*/ IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; - + }; - + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ - + /* The recommended length Alen of the array A passed to colamd is given by the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any @@ -192,14 +191,14 @@ struct Colamd_Row required for the Col and Row arrays, respectively, which are internal to colamd. An additional n_col space is the minimal amount of "elbow room", and nnz/5 more space is recommended for run time efficiency. - + This macro is not needed when using symamd. - + Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid gcc -pedantic warning messages. 
*/ template -inline IndexType colamd_c(IndexType n_col) +inline IndexType colamd_c(IndexType n_col) { return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } template @@ -208,10 +207,10 @@ inline IndexType colamd_r(IndexType n_row) // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[ColamdStats] ); template -static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[ColamdKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); @@ -240,14 +239,14 @@ static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row /** - * \brief Returns the recommended value of Alen - * - * Returns recommended value of Alen for use by colamd. - * Returns -1 if any input argument is negative. - * The use of this routine or macro is optional. - * Note that the macro uses its arguments more than once, - * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. - * + * \brief Returns the recommended value of Alen + * + * Returns recommended value of Alen for use by colamd. + * Returns -1 if any input argument is negative. + * The use of this routine or macro is optional. + * Note that the macro uses its arguments more than once, + * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. + * * \param nnz nonzeros in A * \param n_row number of rows in A * \param n_col number of columns in A @@ -259,18 +258,18 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); else - return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); + return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); } /** * \brief set default parameters The use of this routine is optional. - * - * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) - * entries are removed prior to ordering. Columns with more than - * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to - * ordering, and placed last in the output column ordering. * - * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1, + * Colamd: rows with more than (knobs [ColamdDenseRow] * n_col) + * entries are removed prior to ordering. Columns with more than + * (knobs [ColamdDenseCol] * n_row) entries are removed prior to + * ordering, and placed last in the output column ordering. + * + * ColamdDenseRow and ColamdDenseCol are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. 
If so, they will @@ -279,37 +278,37 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType * not need to change, assuming that you either use * colamd_set_defaults, or pass a (double *) NULL pointer as the * knobs array to colamd or symamd. - * + * * \param knobs parameter settings for colamd */ -static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) +static inline void colamd_set_defaults(double knobs[ColamdKnobs]) { /* === Local variables ================================================== */ - + int i ; if (!knobs) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < COLAMD_KNOBS ; i++) + for (i = 0 ; i < ColamdKnobs ; i++) { knobs [i] = 0 ; } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ + knobs [ColamdDenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [ColamdDenseCol] = 0.5 ; /* ignore columns over 50% dense */ } -/** +/** * \brief Computes a column ordering using the column approximate minimum degree ordering - * + * * Computes a column ordering (Q) of A such that P(AQ)=LU or * (AQ)'AQ=LL' have less fill-in and require fewer floating point * operations than factorizing the unpermuted matrix A or A'A, * respectively. - * - * + * + * * \param n_row number of rows in A * \param n_col number of columns in A * \param Alen, size of the array A @@ -319,10 +318,10 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) * \param stats colamd output statistics and error codes */ template -static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS]) +static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[ColamdKnobs], IndexType stats[ColamdStats]) { /* === Local variables ================================================== */ - + IndexType i ; /* loop index */ IndexType nnz ; /* nonzeros in A */ IndexType Row_size ; /* size of Row [], in integers */ @@ -334,128 +333,128 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ - - + double default_knobs [ColamdKnobs] ; /* default knobs array */ + + /* === Check the input arguments ======================================== */ - + if (!stats) { COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < COLAMD_STATS ; i++) + for (i = 0 ; i < ColamdStats ; i++) { stats [i] = 0 ; } - stats [COLAMD_STATUS] = COLAMD_OK ; - stats [COLAMD_INFO1] = -1 ; - stats [COLAMD_INFO2] = -1 ; - + stats [ColamdStatus] = ColamdOk ; + stats [ColamdInfo1] = -1 ; + stats [ColamdInfo2] = -1 ; + if (!A) /* A is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; + stats [ColamdStatus] = ColamdErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } - + if (!p) /* p is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; + stats [ColamdStatus] = ColamdErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } - + if (n_row < 0) /* n_row must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; - stats [COLAMD_INFO1] = n_row ; + stats [ColamdStatus] = ColamdErrorNrowNegative ; 
+ stats [ColamdInfo1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } - + if (n_col < 0) /* n_col must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; - stats [COLAMD_INFO1] = n_col ; + stats [ColamdStatus] = ColamdErrorNcolNegative ; + stats [ColamdInfo1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } - + nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; - stats [COLAMD_INFO1] = nnz ; + stats [ColamdStatus] = ColamdErrorNnzNegative ; + stats [ColamdInfo1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } - + if (p [0] != 0) { - stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; - stats [COLAMD_INFO1] = p [0] ; + stats [ColamdStatus] = ColamdErrorP0Nonzero ; + stats [ColamdInfo1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } - + /* === If no knobs, set default knobs =================================== */ - + if (!knobs) { colamd_set_defaults (default_knobs) ; knobs = default_knobs ; } - + /* === Allocate the Row and Col arrays from array A ===================== */ - + Col_size = colamd_c (n_col) ; Row_size = colamd_r (n_row) ; need = 2*nnz + n_col + Col_size + Row_size ; - + if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; - stats [COLAMD_INFO1] = need ; - stats [COLAMD_INFO2] = Alen ; + stats [ColamdStatus] = ColamdErrorATooSmall ; + stats [ColamdInfo1] = need ; + stats [ColamdInfo2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } - + Alen -= Col_size + Row_size ; Col = (colamd_col *) &A [Alen] ; Row = (Colamd_Row *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - + if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; return (false) ; } - + /* === Initialize scores, kill dense rows/columns ======================= */ Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; - + /* === Order the supercolumns =========================================== */ - + ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; - + /* === Order the non-principal columns ================================== */ - + Eigen::internal::order_children (n_col, Col, p) ; - + /* === Return statistics in stats ======================================= */ - - stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; - stats [COLAMD_DENSE_COL] = n_col - n_col2 ; - stats [COLAMD_DEFRAG_COUNT] = ngarbage ; - COLAMD_DEBUG0 (("colamd: done.\n")) ; + + stats [ColamdDenseRow] = n_row - n_row2 ; + stats [ColamdDenseCol] = n_col - n_col2 ; + stats [ColamdDefragCount] = ngarbage ; + COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -489,7 +488,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [COLAMD_STATS] /* colamd statistics */ + IndexType stats [ColamdStats] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -512,24 +511,24 @@ static IndexType init_rows_cols /* 
returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = Col [col].length ; + stats [ColamdStatus] = ColamdErrorColLengthNegative ; + stats [ColamdInfo1] = col ; + stats [ColamdInfo2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = COLAMD_EMPTY ; - Col [col].shared4.degree_next = COLAMD_EMPTY ; + Col [col].shared3.prev = ColamdEmpty ; + Col [col].shared4.degree_next = ColamdEmpty ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [ColamdInfo3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -551,10 +550,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - stats [COLAMD_INFO3] = n_row ; + stats [ColamdStatus] = ColamdErrorRowIndexOutOfBounds ; + stats [ColamdInfo1] = col ; + stats [ColamdInfo2] = row ; + stats [ColamdInfo3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -563,10 +562,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. 
*/ - stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - (stats [COLAMD_INFO3]) ++ ; + stats [ColamdStatus] = ColamdOkButJumbled ; + stats [ColamdInfo1] = col ; + stats [ColamdInfo2] = row ; + (stats [ColamdInfo3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -604,7 +603,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [ColamdStatus] == ColamdOkButJumbled) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -646,7 +645,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [ColamdStatus] == ColamdOkButJumbled) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -705,7 +704,7 @@ static void init_scoring colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ + double knobs [ColamdKnobs],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -732,8 +731,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -870,7 +869,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = COLAMD_EMPTY ; + head [c] = ColamdEmpty ; } min_score = n_col ; /* place in reverse order, so low column indices are at the front */ @@ -891,16 +890,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [score] >= ColamdEmpty) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = COLAMD_EMPTY ; + Col [c].shared3.prev = ColamdEmpty ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != COLAMD_EMPTY) + if (next_col != ColamdEmpty) { Col [next_col].shared3.prev = c ; } @@ -1001,10 +1000,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [min_score] >= ColamdEmpty) ; /* get pivot column from head of minimum degree list */ - while (min_score < n_col && head [min_score] == 
COLAMD_EMPTY) + while (min_score < n_col && head [min_score] == ColamdEmpty) { min_score++ ; } @@ -1012,9 +1011,9 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != COLAMD_EMPTY) + if (next_col != ColamdEmpty) { - Col [next_col].shared3.prev = COLAMD_EMPTY ; + Col [next_col].shared3.prev = ColamdEmpty ; } COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; @@ -1120,7 +1119,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = COLAMD_EMPTY ; + pivot_row = ColamdEmpty ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1172,8 +1171,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ; - if (prev_col == COLAMD_EMPTY) + COLAMD_ASSERT (cur_score >= ColamdEmpty) ; + if (prev_col == ColamdEmpty) { head [cur_score] = next_col ; } @@ -1181,7 +1180,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if (next_col != COLAMD_EMPTY) + if (next_col != ColamdEmpty) { Col [next_col].shared3.prev = prev_col ; } @@ -1302,7 +1301,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > ColamdEmpty) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1391,11 +1390,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [cur_score] >= ColamdEmpty) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = COLAMD_EMPTY ; - if (next_col != COLAMD_EMPTY) + Col [col].shared3.prev = ColamdEmpty ; + if (next_col != ColamdEmpty) { Col [next_col].shared3.prev = col ; } @@ -1465,7 +1464,7 @@ static inline void order_children { /* find an un-ordered non-principal column */ COLAMD_ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY) + if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == ColamdEmpty) { parent = i ; /* once found, find its principal parent */ @@ -1482,7 +1481,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ; + COLAMD_ASSERT (Col [c].shared2.order == ColamdEmpty) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1495,7 +1494,7 @@ static inline void order_children /* continue until we hit an ordered column. 
There are */ /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == COLAMD_EMPTY) ; + } while (Col [c].shared2.order == ColamdEmpty) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1547,7 +1546,7 @@ template static void detect_super_cols ( /* === Parameters ======================================================= */ - + colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ @@ -1590,7 +1589,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > ColamdEmpty) { first_col = Col [head_column].shared3.headhash ; } @@ -1601,7 +1600,7 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != COLAMD_EMPTY ; + for (super_c = first_col ; super_c != ColamdEmpty ; super_c = Col [super_c].shared4.hash_next) { COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; @@ -1614,7 +1613,7 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next) + c != ColamdEmpty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; COLAMD_ASSERT (COL_IS_ALIVE (c)) ; @@ -1660,7 +1659,7 @@ static void detect_super_cols Col [c].shared1.parent = super_c ; KILL_NON_PRINCIPAL_COL (c) ; /* order c later, in order_children() */ - Col [c].shared2.order = COLAMD_EMPTY ; + Col [c].shared2.order = ColamdEmpty ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1668,15 +1667,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > COLAMD_EMPTY) + if (head_column > ColamdEmpty) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = COLAMD_EMPTY ; + Col [head_column].shared3.headhash = ColamdEmpty ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = COLAMD_EMPTY ; + head [hash] = ColamdEmpty ; } } } @@ -1698,7 +1697,7 @@ template static IndexType garbage_collection /* returns the new value of pfree */ ( /* === Parameters ======================================================= */ - + IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ Colamd_Row Row [], /* row info */ @@ -1839,5 +1838,5 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ } -} // namespace internal +} // namespace internal #endif From 39f30923c29c77c3a17c77b9f59dbc73291cf02a Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Thu, 15 Aug 2019 20:15:19 +0000 Subject: [PATCH 23/30] Eigen_Colamd.h edited replacing macros with constexprs and functions. 
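The refactoring pattern in this patch is uniform: each function-like macro becomes a small template function and each #define'd constant a typed constant, so the names are type-checked and properly scoped instead of living in the preprocessor. Below is a minimal standalone sketch of that pattern, using two of the definitions introduced in the diff (the main() and asserts are illustrative only and are not part of the patch):

    #include <cassert>

    // was a plain #define'd value; now a typed, scoped constant
    constexpr int ColamdAlive = 0;

    // was: #define ONES_COMPLEMENT(r) (-(r)-1)
    template <typename IndexType>
    IndexType ones_complement(const IndexType r) {
      return (-(r)-1);                       // same arithmetic as the old macro
    }

    // was: #define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ColamdAlive)
    template <typename IndexType>
    bool row_is_marked_dead(const IndexType row_mark) {
      return row_mark < ColamdAlive;
    }

    int main() {
      assert(ones_complement(5) == ~5);      // -(r)-1 is exactly one's complement
      assert(row_is_marked_dead(-1) && !row_is_marked_dead(0));
      return 0;
    }

Behaviour is unchanged; only the spelling moves from the preprocessor to the compiler. (Note that patch 25 below swaps these constexpr constants back to const and enums.)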
--- Eigen/src/OrderingMethods/Eigen_Colamd.h | 208 ++++++++++++++--------- 1 file changed, 126 insertions(+), 82 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index bba7c67ac..aec383abf 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -47,6 +47,16 @@ #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H +/* ========================================================================== */ +/* === Knob and statistics definitions used elsewhere ======================= */ +/* ========================================================================== */ + +/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ +constexpr size_t ColamdKnobs = 20; + +/* number of output statistics. Only stats [0..6] are currently used. */ +constexpr size_t ColamdStats = 20; + namespace internal { /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG @@ -56,71 +66,59 @@ namespace internal { /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ -/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -const size_t ColamdKnobs = 20; - -/* number of output statistics. Only stats [0..6] are currently used. */ -const size_t ColamdStats = 20; - /* knobs [0] and stats [0]: dense row knob and output statistic. */ -const size_t ColamdDenseRow = 0; +constexpr size_t ColamdDenseRow = 0; /* knobs [1] and stats [1]: dense column knob and output statistic. */ -const size_t ColamdDenseCol = 1; +constexpr size_t ColamdDenseCol = 1; /* stats [2]: memory defragmentation count output statistic */ -const size_t ColamdDefragCount = 2; +constexpr size_t ColamdDefragCount = 2; /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -const size_t ColamdStatus = 3; +constexpr size_t ColamdStatus = 3; /* stats [4..6]: error info, or info on jumbled columns */ -const size_t ColamdInfo1 = 4; -const size_t ColamdInfo2 = 5; -const size_t ColamdInfo3 = 6; +constexpr size_t ColamdInfo1 = 4; +constexpr size_t ColamdInfo2 = 5; +constexpr size_t ColamdInfo3 = 6; /* error codes returned in stats [3]: */ -const int ColamdOk = 0; -const int ColamdOkButJumbled = 1; -const int ColamdErrorANotPresent = -1; -const int ColamdErrorPNotPresent = -2; -const int ColamdErrorNrowNegative = -3; -const int ColamdErrorNcolNegative = -4; -const int ColamdErrorNnzNegative = -5; -const int ColamdErrorP0Nonzero = -6; -const int ColamdErrorATooSmall = -7; -const int ColamdErrorColLengthNegative = -8; -const int ColamdErrorRowIndexOutOfBounds = -9; -const int ColamdErrorOutOfMemory = -10; -const int ColamdErrorInternalError = -999; +constexpr int ColamdOk = 0; +constexpr int ColamdOkButJumbled = 1; +constexpr int ColamdErrorANotPresent = -1; +constexpr int ColamdErrorPNotPresent = -2; +constexpr int ColamdErrorNrowNegative = -3; +constexpr int ColamdErrorNcolNegative = -4; +constexpr int ColamdErrorNnzNegative = -5; +constexpr int ColamdErrorP0Nonzero = -6; +constexpr int ColamdErrorATooSmall = -7; +constexpr int ColamdErrorColLengthNegative = -8; +constexpr int ColamdErrorRowIndexOutOfBounds = -9; +constexpr int ColamdErrorOutOfMemory = -10; +constexpr int ColamdErrorInternalError = -999; /* ========================================================================== */ /* === Definitions ========================================================== */ /* 
========================================================================== */ -#define ONES_COMPLEMENT(r) (-(r)-1) +// #define ONES_COMPLEMENT(r) (-(r)-1) + +template +IndexType ones_complement(const IndexType r) { + return (-(r)-1); +} /* -------------------------------------------------------------------------- */ -const int ColamdEmpty = -1; +constexpr int ColamdEmpty = -1; /* Row and column status */ -const int ColamdAlive = 0; -const int ColamdDead = -1; +constexpr int ColamdAlive = 0; +constexpr int ColamdDead = -1; /* Column status */ -const int ColamdDeadPrincipal = -1; -const int ColamdDeadNonPrincipal = -2; - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ColamdAlive) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ColamdAlive) -#define COL_IS_DEAD(c) (Col [c].start < ColamdAlive) -#define COL_IS_ALIVE(c) (Col [c].start >= ColamdAlive) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == ColamdDeadPrincipal) -#define KILL_ROW(r) { Row [r].shared2.mark = ColamdDead ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadPrincipal ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadNonPrincipal ; } +constexpr int ColamdDeadPrincipal = -1; +constexpr int ColamdDeadNonPrincipal = -2; /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -179,6 +177,52 @@ struct Colamd_Row }; +/* Methods for row and column status update and checking. */ +template +bool row_is_marked_dead(const IndexType row_mark) { + return row_mark < ColamdAlive; +} + +template +bool row_is_dead(const Colamd_Row* row, const IndexType r) { + return row_is_marked_dead(row[r].shared2.mark); +} + +template +bool row_is_alive(const Colamd_Row* row, const IndexType r) { + return row[r].shared2.mark >= ColamdAlive; +} + +template +void kill_row(Colamd_Row* row, const IndexType r) { + row[r].shared2.mark = ColamdDead; +} + +template +bool col_is_dead(const colamd_col* col, const IndexType c) { + return col[c].start < ColamdAlive; +} + +template +bool col_is_alive(const colamd_col* col, const IndexType c) { + return col[c].start >= ColamdAlive; +} + +template +bool col_is_dead_principal(const colamd_col* col, const IndexType c) { + return col[c].start == ColamdDeadPrincipal; +} + +template +void kill_principal_col(colamd_col* col, const IndexType c) { + col[c].start = ColamdDeadPrincipal; +} + +template +void kill_non_principal_col(colamd_col* col, const IndexType c) { + col[c].start = ColamdDeadNonPrincipal; +} + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ @@ -749,7 +793,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + kill_principal_col(Col, c) ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -760,7 +804,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (COL_IS_DEAD (c)) + if (col_is_dead(Col, c)) { continue ; } @@ -776,7 +820,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - KILL_PRINCIPAL_COL (c) ; + kill_principal_col(Col, c) ; } } COLAMD_DEBUG1 (("colamd: Dense and null 
columns killed: %d\n", n_col - n_col2)) ; @@ -790,7 +834,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - KILL_ROW (r) ; + kill_row(Row, r) ; --n_row2 ; } else @@ -812,7 +856,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (COL_IS_DEAD (c)) + if (col_is_dead(Col, c)) { continue ; } @@ -825,7 +869,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (ROW_IS_DEAD (row)) + if (row_is_dead(Row, row)) { continue ; } @@ -844,7 +888,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + kill_principal_col(Col, c) ; } else { @@ -877,7 +921,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) + if (col_is_alive(Col, c)) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -1016,7 +1060,7 @@ static IndexType find_ordering /* return the number of garbage collections */ Col [next_col].shared3.prev = ColamdEmpty ; } - COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; + COLAMD_ASSERT (col_is_alive(Col, pivot_col)) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1063,9 +1107,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", row_is_alive(Row, row), row)) ; /* skip if row is dead */ - if (ROW_IS_DEAD (row)) + if (row_is_dead(Row, row)) { continue ; } @@ -1077,7 +1121,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) + if (col_thickness > 0 && col_is_alive(Col, col)) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1104,7 +1148,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; + kill_row(Row, row) ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1156,7 +1200,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1195,7 +1239,7 @@ static IndexType find_ordering /* return the number of garbage collections */ row = *cp++ ; row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (row_is_marked_dead (row_mark)) { continue ; } @@ -1214,7 +1258,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. 
Row: %d\n", row)) ; - KILL_ROW (row) ; + kill_row(Row, row) ; } else { @@ -1236,7 +1280,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1253,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT(row >= 0 && row < n_row) ; row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (row_is_marked_dead (row_mark)) { continue ; } @@ -1277,7 +1321,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; + kill_principal_col(Col, col) ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1318,7 +1362,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (COL_IS_ALIVE (col)) ; + COLAMD_ASSERT (col_is_alive(Col, col)) ; } } @@ -1332,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Kill the pivotal column ====================================== */ - KILL_PRINCIPAL_COL (pivot_col) ; + kill_principal_col(Col, pivot_col) ; /* === Clear mark =================================================== */ @@ -1356,7 +1400,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (COL_IS_DEAD (col)) + if (col_is_dead(Col, col)) { continue ; } @@ -1463,15 +1507,15 @@ static inline void order_children for (i = 0 ; i < n_col ; i++) { /* find an un-ordered non-principal column */ - COLAMD_ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == ColamdEmpty) + COLAMD_ASSERT (col_is_dead(Col, i)) ; + if (!col_is_dead_principal(Col, i) && Col [i].shared2.order == ColamdEmpty) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; + } while (!col_is_dead_principal(Col, parent)) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. 
collapse tree at the same time */ @@ -1577,7 +1621,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (COL_IS_DEAD (col)) + if (col_is_dead(Col, col)) { continue ; } @@ -1603,7 +1647,7 @@ static void detect_super_cols for (super_c = first_col ; super_c != ColamdEmpty ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; + COLAMD_ASSERT (col_is_alive(Col, super_c)) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1616,7 +1660,7 @@ static void detect_super_cols c != ColamdEmpty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (COL_IS_ALIVE (c)) ; + COLAMD_ASSERT (col_is_alive(Col, c)) ; COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1657,7 +1701,7 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; + kill_non_principal_col(Col, c) ; /* order c later, in order_children() */ Col [c].shared2.order = ColamdEmpty ; /* remove c from hash bucket */ @@ -1720,7 +1764,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (COL_IS_ALIVE (c)) + if (col_is_alive(Col, c)) { psrc = &A [Col [c].start] ; @@ -1731,7 +1775,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (ROW_IS_ALIVE (r)) + if (row_is_alive(Row, r)) { *pdest++ = r ; } @@ -1744,22 +1788,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (row_is_alive(Row, r)) { if (Row [r].length == 0) { /* this row is of zero length. 
cannot compact it, so kill it */ COLAMD_DEBUG3 (("Defrag row kill\n")) ; - KILL_ROW (r) ; + kill_row(Row, r) ; } else { /* save first column index in Row [r].shared2.first_column */ psrc = &A [Row [r].start] ; Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + COLAMD_ASSERT (row_is_alive(Row, r)) ; /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; + *psrc = ones_complement(r) ; } } @@ -1775,11 +1819,11 @@ static IndexType garbage_collection /* returns the new value of pfree */ { psrc-- ; /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; + r = ones_complement(*psrc) ; COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + COLAMD_ASSERT (row_is_alive(Row, r)) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1788,7 +1832,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (COL_IS_ALIVE (c)) + if (col_is_alive(Col, c)) { *pdest++ = c ; } @@ -1829,7 +1873,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (row_is_alive(Row, r)) { Row [r].shared2.mark = 0 ; } From 283558face1688a69683b1124142325a3ac4855a Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Thu, 15 Aug 2019 20:21:56 +0000 Subject: [PATCH 24/30] Ordering.h edited to fix dependencies on Eigen_Colamd.h --- Eigen/src/OrderingMethods/Ordering.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 8791158be..10ba6b464 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -131,8 +131,8 @@ class COLAMDOrdering // Get the recommended value of Alen to be used by colamd StorageIndex Alen = internal::colamd_recommended(nnz, m, n); // Set the default parameters - double knobs [COLAMD_KNOBS]; - StorageIndex stats [COLAMD_STATS]; + double knobs [ColamdKnobs]; + StorageIndex stats [ColamdStats]; internal::colamd_set_defaults(knobs); IndexVector p(n+1), A(Alen); From a4d1a6cd7de5112e1b2aca1eaf76b06ed1619c81 Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Sat, 17 Aug 2019 05:29:23 +0000 Subject: [PATCH 25/30] Eigen_Colamd.h updated to replace constexpr with consts and enums. --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 230 ++++++++++++----------- 1 file changed, 118 insertions(+), 112 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index aec383abf..3c9e85aa8 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -52,10 +52,10 @@ /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -constexpr size_t ColamdKnobs = 20; +const size_t ColamdKnobs = 20; /* number of output statistics. Only stats [0..6] are currently used. */ -constexpr size_t ColamdStats = 20; +const size_t ColamdStats = 20; namespace internal { /* Ensure that debugging is turned off: */ @@ -66,59 +66,65 @@ namespace internal { /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ -/* knobs [0] and stats [0]: dense row knob and output statistic. 
*/ -constexpr size_t ColamdDenseRow = 0; +/* Indices into knobs and stats array. */ +enum KnobsStatsIndex { + /* knobs [0] and stats [0]: dense row knob and output statistic. */ + DenseRow = 0, -/* knobs [1] and stats [1]: dense column knob and output statistic. */ -constexpr size_t ColamdDenseCol = 1; + /* knobs [1] and stats [1]: dense column knob and output statistic. */ + DenseCol = 1, -/* stats [2]: memory defragmentation count output statistic */ -constexpr size_t ColamdDefragCount = 2; + /* stats [2]: memory defragmentation count output statistic */ + DefragCount = 2, -/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -constexpr size_t ColamdStatus = 3; + /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ + Status = 3, -/* stats [4..6]: error info, or info on jumbled columns */ -constexpr size_t ColamdInfo1 = 4; -constexpr size_t ColamdInfo2 = 5; -constexpr size_t ColamdInfo3 = 6; + /* stats [4..6]: error info, or info on jumbled columns */ + Info1 = 4, + Info2 = 5, + Info3 = 6 +}; /* error codes returned in stats [3]: */ -constexpr int ColamdOk = 0; -constexpr int ColamdOkButJumbled = 1; -constexpr int ColamdErrorANotPresent = -1; -constexpr int ColamdErrorPNotPresent = -2; -constexpr int ColamdErrorNrowNegative = -3; -constexpr int ColamdErrorNcolNegative = -4; -constexpr int ColamdErrorNnzNegative = -5; -constexpr int ColamdErrorP0Nonzero = -6; -constexpr int ColamdErrorATooSmall = -7; -constexpr int ColamdErrorColLengthNegative = -8; -constexpr int ColamdErrorRowIndexOutOfBounds = -9; -constexpr int ColamdErrorOutOfMemory = -10; -constexpr int ColamdErrorInternalError = -999; - +enum Status { + Ok = 0, + OkButJumbled = 1, + ErrorANotPresent = -1, + ErrorPNotPresent = -2, + ErrorNrowNegative = -3, + ErrorNcolNegative = -4, + ErrorNnzNegative = -5, + ErrorP0Nonzero = -6, + ErrorATooSmall = -7, + ErrorColLengthNegative = -8, + ErrorRowIndexOutOfBounds = -9, + ErrorOutOfMemory = -10, + ErrorInternalError = -999 +}; /* ========================================================================== */ /* === Definitions ========================================================== */ /* ========================================================================== */ -// #define ONES_COMPLEMENT(r) (-(r)-1) - template IndexType ones_complement(const IndexType r) { return (-(r)-1); } /* -------------------------------------------------------------------------- */ -constexpr int ColamdEmpty = -1; +const int Empty = -1; /* Row and column status */ -constexpr int ColamdAlive = 0; -constexpr int ColamdDead = -1; +enum RowColumnStatus { + Alive = 0, + Dead = -1 +}; /* Column status */ -constexpr int ColamdDeadPrincipal = -1; -constexpr int ColamdDeadNonPrincipal = -2; +enum ColumnStatus { + DeadPrincipal = -1, + DeadNonPrincipal = -2 +}; /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -128,7 +134,7 @@ constexpr int ColamdDeadNonPrincipal = -2; template struct colamd_col { - IndexType start ; /* index for A of first row in this column, or ColamdDead */ + IndexType start ; /* index for A of first row in this column, or Dead */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -180,7 +186,7 @@ struct Colamd_Row /* Methods for row and column status update and checking. 
*/ template bool row_is_marked_dead(const IndexType row_mark) { - return row_mark < ColamdAlive; + return row_mark < Alive; } template @@ -190,37 +196,37 @@ bool row_is_dead(const Colamd_Row* row, const IndexType r) { template bool row_is_alive(const Colamd_Row* row, const IndexType r) { - return row[r].shared2.mark >= ColamdAlive; + return row[r].shared2.mark >= Alive; } template void kill_row(Colamd_Row* row, const IndexType r) { - row[r].shared2.mark = ColamdDead; + row[r].shared2.mark = Dead; } template bool col_is_dead(const colamd_col* col, const IndexType c) { - return col[c].start < ColamdAlive; + return col[c].start < Alive; } template bool col_is_alive(const colamd_col* col, const IndexType c) { - return col[c].start >= ColamdAlive; + return col[c].start >= Alive; } template bool col_is_dead_principal(const colamd_col* col, const IndexType c) { - return col[c].start == ColamdDeadPrincipal; + return col[c].start == DeadPrincipal; } template void kill_principal_col(colamd_col* col, const IndexType c) { - col[c].start = ColamdDeadPrincipal; + col[c].start = DeadPrincipal; } template void kill_non_principal_col(colamd_col* col, const IndexType c) { - col[c].start = ColamdDeadNonPrincipal; + col[c].start = DeadNonPrincipal; } /* ========================================================================== */ @@ -308,12 +314,12 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType /** * \brief set default parameters The use of this routine is optional. * - * Colamd: rows with more than (knobs [ColamdDenseRow] * n_col) + * Colamd: rows with more than (knobs [DenseRow] * n_col) * entries are removed prior to ordering. Columns with more than - * (knobs [ColamdDenseCol] * n_row) entries are removed prior to + * (knobs [DenseCol] * n_row) entries are removed prior to * ordering, and placed last in the output column ordering. * - * ColamdDenseRow and ColamdDenseCol are defined as 0 and 1, + * DenseRow and DenseCol are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. 
If so, they will @@ -340,8 +346,8 @@ static inline void colamd_set_defaults(double knobs[ColamdKnobs]) { knobs [i] = 0 ; } - knobs [ColamdDenseRow] = 0.5 ; /* ignore rows over 50% dense */ - knobs [ColamdDenseCol] = 0.5 ; /* ignore columns over 50% dense */ + knobs [DenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [DenseCol] = 0.5 ; /* ignore columns over 50% dense */ } /** @@ -391,36 +397,36 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * { stats [i] = 0 ; } - stats [ColamdStatus] = ColamdOk ; - stats [ColamdInfo1] = -1 ; - stats [ColamdInfo2] = -1 ; + stats [Status] = Ok ; + stats [Info1] = -1 ; + stats [Info2] = -1 ; if (!A) /* A is not present */ { - stats [ColamdStatus] = ColamdErrorANotPresent ; + stats [Status] = ErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } if (!p) /* p is not present */ { - stats [ColamdStatus] = ColamdErrorPNotPresent ; + stats [Status] = ErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } if (n_row < 0) /* n_row must be >= 0 */ { - stats [ColamdStatus] = ColamdErrorNrowNegative ; - stats [ColamdInfo1] = n_row ; + stats [Status] = ErrorNrowNegative ; + stats [Info1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } if (n_col < 0) /* n_col must be >= 0 */ { - stats [ColamdStatus] = ColamdErrorNcolNegative ; - stats [ColamdInfo1] = n_col ; + stats [Status] = ErrorNcolNegative ; + stats [Info1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } @@ -428,16 +434,16 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [ColamdStatus] = ColamdErrorNnzNegative ; - stats [ColamdInfo1] = nnz ; + stats [Status] = ErrorNnzNegative ; + stats [Info1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } if (p [0] != 0) { - stats [ColamdStatus] = ColamdErrorP0Nonzero ; - stats [ColamdInfo1] = p [0] ; + stats [Status] = ErrorP0Nonzero ; + stats [Info1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } @@ -459,9 +465,9 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [ColamdStatus] = ColamdErrorATooSmall ; - stats [ColamdInfo1] = need ; - stats [ColamdInfo2] = Alen ; + stats [Status] = ErrorATooSmall ; + stats [Info1] = need ; + stats [Info2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } @@ -495,9 +501,9 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* === Return statistics in stats ======================================= */ - stats [ColamdDenseRow] = n_row - n_row2 ; - stats [ColamdDenseCol] = n_col - n_col2 ; - stats [ColamdDefragCount] = ngarbage ; + stats [DenseRow] = n_row - n_row2 ; + stats [DenseCol] = n_col - n_col2 ; + stats [DefragCount] = ngarbage ; COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -555,24 +561,24 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [ColamdStatus] = ColamdErrorColLengthNegative ; - stats [ColamdInfo1] = col ; - stats [ColamdInfo2] = Col [col].length ; + stats [Status] = ErrorColLengthNegative ; + stats 
[Info1] = col ; + stats [Info2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = ColamdEmpty ; - Col [col].shared4.degree_next = ColamdEmpty ; + Col [col].shared3.prev = Empty ; + Col [col].shared4.degree_next = Empty ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [ColamdInfo3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [Info3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -594,10 +600,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [ColamdStatus] = ColamdErrorRowIndexOutOfBounds ; - stats [ColamdInfo1] = col ; - stats [ColamdInfo2] = row ; - stats [ColamdInfo3] = n_row ; + stats [Status] = ErrorRowIndexOutOfBounds ; + stats [Info1] = col ; + stats [Info2] = row ; + stats [Info3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -606,10 +612,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. */ - stats [ColamdStatus] = ColamdOkButJumbled ; - stats [ColamdInfo1] = col ; - stats [ColamdInfo2] = row ; - (stats [ColamdInfo3]) ++ ; + stats [Status] = OkButJumbled ; + stats [Info1] = col ; + stats [Info2] = row ; + (stats [Info3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -647,7 +653,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [ColamdStatus] == ColamdOkButJumbled) + if (stats [Status] == OkButJumbled) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -689,7 +695,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [ColamdStatus] == ColamdOkButJumbled) + if (stats [Status] == OkButJumbled) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -775,8 +781,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseRow] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseCol] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -913,7 +919,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = ColamdEmpty ; + head [c] = Empty ; } min_score = n_col ; /* place in reverse order, so low column indices are at the front */ @@ -934,16 +940,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - 
COLAMD_ASSERT (head [score] >= ColamdEmpty) ; + COLAMD_ASSERT (head [score] >= Empty) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = ColamdEmpty ; + Col [c].shared3.prev = Empty ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != ColamdEmpty) + if (next_col != Empty) { Col [next_col].shared3.prev = c ; } @@ -1044,10 +1050,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= ColamdEmpty) ; + COLAMD_ASSERT (head [min_score] >= Empty) ; /* get pivot column from head of minimum degree list */ - while (min_score < n_col && head [min_score] == ColamdEmpty) + while (min_score < n_col && head [min_score] == Empty) { min_score++ ; } @@ -1055,9 +1061,9 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != ColamdEmpty) + if (next_col != Empty) { - Col [next_col].shared3.prev = ColamdEmpty ; + Col [next_col].shared3.prev = Empty ; } COLAMD_ASSERT (col_is_alive(Col, pivot_col)) ; @@ -1163,7 +1169,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = ColamdEmpty ; + pivot_row = Empty ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1215,8 +1221,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= ColamdEmpty) ; - if (prev_col == ColamdEmpty) + COLAMD_ASSERT (cur_score >= Empty) ; + if (prev_col == Empty) { head [cur_score] = next_col ; } @@ -1224,7 +1230,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if (next_col != ColamdEmpty) + if (next_col != Empty) { Col [next_col].shared3.prev = prev_col ; } @@ -1345,7 +1351,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > ColamdEmpty) + if (head_column > Empty) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1434,11 +1440,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= ColamdEmpty) ; + COLAMD_ASSERT (head [cur_score] >= Empty) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = ColamdEmpty ; - if (next_col != ColamdEmpty) + Col [col].shared3.prev = Empty ; + if (next_col != Empty) { Col [next_col].shared3.prev = col ; } @@ -1508,7 +1514,7 @@ static inline void order_children { /* find an un-ordered non-principal column */ COLAMD_ASSERT (col_is_dead(Col, i)) ; - if (!col_is_dead_principal(Col, i) && Col [i].shared2.order == ColamdEmpty) + if (!col_is_dead_principal(Col, i) && Col 
[i].shared2.order == Empty) { parent = i ; /* once found, find its principal parent */ @@ -1525,7 +1531,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == ColamdEmpty) ; + COLAMD_ASSERT (Col [c].shared2.order == Empty) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1538,7 +1544,7 @@ static inline void order_children /* continue until we hit an ordered column. There are */ /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == ColamdEmpty) ; + } while (Col [c].shared2.order == Empty) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1633,7 +1639,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > ColamdEmpty) + if (head_column > Empty) { first_col = Col [head_column].shared3.headhash ; } @@ -1644,7 +1650,7 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != ColamdEmpty ; + for (super_c = first_col ; super_c != Empty ; super_c = Col [super_c].shared4.hash_next) { COLAMD_ASSERT (col_is_alive(Col, super_c)) ; @@ -1657,7 +1663,7 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != ColamdEmpty ; c = Col [c].shared4.hash_next) + c != Empty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; COLAMD_ASSERT (col_is_alive(Col, c)) ; @@ -1703,7 +1709,7 @@ static void detect_super_cols Col [c].shared1.parent = super_c ; kill_non_principal_col(Col, c) ; /* order c later, in order_children() */ - Col [c].shared2.order = ColamdEmpty ; + Col [c].shared2.order = Empty ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1711,15 +1717,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > ColamdEmpty) + if (head_column > Empty) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = ColamdEmpty ; + Col [head_column].shared3.headhash = Empty ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = ColamdEmpty ; + head [hash] = Empty ; } } } From 15f3d9d2720f060100b2559058c9383b6ffa4d3e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Sep 2019 00:50:51 +0200 Subject: [PATCH 26/30] More colamd cleanup: - Move colamd implementation in its own namespace to avoid polluting the internal namespace with Ok, Status, etc. - Fix signed/unsigned warning - move some ugly free functions as member functions --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 335 +++++++++++------------ Eigen/src/OrderingMethods/Ordering.h | 10 +- 2 files changed, 158 insertions(+), 187 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 3c9e85aa8..8e339a704 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -47,25 +47,26 @@ #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H -/* ========================================================================== */ -/* === Knob and statistics definitions used elsewhere ======================= */ -/* ========================================================================== */ - -/* size of the knobs [ ] array. 
Only knobs [0..1] are currently used. */ -const size_t ColamdKnobs = 20; - -/* number of output statistics. Only stats [0..6] are currently used. */ -const size_t ColamdStats = 20; - namespace internal { + +namespace Colamd { + /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG #define COLAMD_NDEBUG #endif /* NDEBUG */ + + /* ========================================================================== */ /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ +/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ +const int NKnobs = 20; + +/* number of output statistics. Only stats [0..6] are currently used. */ +const int NStats = 20; + /* Indices into knobs and stats array. */ enum KnobsStatsIndex { /* knobs [0] and stats [0]: dense row knob and output statistic. */ @@ -132,7 +133,7 @@ enum ColumnStatus { // == Row and Column structures == template -struct colamd_col +struct ColStructure { IndexType start ; /* index for A of first row in this column, or Dead */ /* if column is dead */ @@ -163,10 +164,20 @@ struct colamd_col IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; + inline bool is_dead() const { return start < Alive; } + + inline bool is_alive() const { return start >= Alive; } + + inline bool is_dead_principal() const { return start == DeadPrincipal; } + + inline void kill_principal() { start = DeadPrincipal; } + + inline void kill_non_principal() { start = DeadNonPrincipal; } + }; template -struct Colamd_Row +struct RowStructure { IndexType start ; /* index for A of first col in this row */ IndexType length ; /* number of principal columns in this row */ @@ -181,54 +192,14 @@ struct Colamd_Row IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; + inline bool is_dead() const { return shared2.mark < Alive; } + + inline bool is_alive() const { return shared2.mark >= Alive; } + + inline void kill() { shared2.mark = Dead; } + }; -/* Methods for row and column status update and checking. 
*/ -template -bool row_is_marked_dead(const IndexType row_mark) { - return row_mark < Alive; -} - -template -bool row_is_dead(const Colamd_Row* row, const IndexType r) { - return row_is_marked_dead(row[r].shared2.mark); -} - -template -bool row_is_alive(const Colamd_Row* row, const IndexType r) { - return row[r].shared2.mark >= Alive; -} - -template -void kill_row(Colamd_Row* row, const IndexType r) { - row[r].shared2.mark = Dead; -} - -template -bool col_is_dead(const colamd_col* col, const IndexType c) { - return col[c].start < Alive; -} - -template -bool col_is_alive(const colamd_col* col, const IndexType c) { - return col[c].start >= Alive; -} - -template -bool col_is_dead_principal(const colamd_col* col, const IndexType c) { - return col[c].start == DeadPrincipal; -} - -template -void kill_principal_col(colamd_col* col, const IndexType c) { - col[c].start = DeadPrincipal; -} - -template -void kill_non_principal_col(colamd_col* col, const IndexType c) { - col[c].start = DeadNonPrincipal; -} - /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ @@ -249,33 +220,33 @@ void kill_non_principal_col(colamd_col* col, const IndexType c) { */ template inline IndexType colamd_c(IndexType n_col) -{ return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } +{ return IndexType( ((n_col) + 1) * sizeof (ColStructure) / sizeof (IndexType) ) ; } template inline IndexType colamd_r(IndexType n_row) -{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row) / sizeof (IndexType)); } +{ return IndexType(((n_row) + 1) * sizeof (RowStructure) / sizeof (IndexType)); } // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[ColamdStats] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure col [], IndexType A [], IndexType p [], IndexType stats[NStats] ); template -static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[ColamdKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template -static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); +static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); template -static void order_children (IndexType n_col, colamd_col Col [], IndexType p []); +static void order_children (IndexType n_col, ColStructure Col [], IndexType p []); template -static void detect_super_cols (colamd_col Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; +static void detect_super_cols (ColStructure Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; template -static IndexType 
garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType *pfree) ; +static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType *pfree) ; template -static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row [] ) ; +static inline IndexType clear_mark (IndexType n_row, RowStructure Row [] ) ; /* === No debugging ========================================================= */ @@ -303,7 +274,7 @@ static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row * \return recommended value of Alen for use by colamd */ template -inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col) +inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col) { if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); @@ -332,7 +303,7 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType * \param knobs parameter settings for colamd */ -static inline void colamd_set_defaults(double knobs[ColamdKnobs]) +static inline void set_defaults(double knobs[NKnobs]) { /* === Local variables ================================================== */ @@ -342,12 +313,12 @@ static inline void colamd_set_defaults(double knobs[ColamdKnobs]) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < ColamdKnobs ; i++) + for (i = 0 ; i < NKnobs ; i++) { knobs [i] = 0 ; } - knobs [DenseRow] = 0.5 ; /* ignore rows over 50% dense */ - knobs [DenseCol] = 0.5 ; /* ignore columns over 50% dense */ + knobs [Colamd::DenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [Colamd::DenseCol] = 0.5 ; /* ignore columns over 50% dense */ } /** @@ -368,7 +339,7 @@ static inline void colamd_set_defaults(double knobs[ColamdKnobs]) * \param stats colamd output statistics and error codes */ template -static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[ColamdKnobs], IndexType stats[ColamdStats]) +static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats]) { /* === Local variables ================================================== */ @@ -377,13 +348,13 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * IndexType Row_size ; /* size of Row [], in integers */ IndexType Col_size ; /* size of Col [], in integers */ IndexType need ; /* minimum required length of A */ - Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ - colamd_col *Col ; /* pointer into A of Col [0..n_col] array */ + Colamd::RowStructure *Row ; /* pointer into A of Row [0..n_row] array */ + Colamd::ColStructure *Col ; /* pointer into A of Col [0..n_col] array */ IndexType n_col2 ; /* number of non-dense, non-empty columns */ IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [ColamdKnobs] ; /* default knobs array */ + double default_knobs [NKnobs] ; /* default knobs array */ /* === Check the input arguments ======================================== */ @@ -393,40 +364,40 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < ColamdStats ; i++) + for (i = 0 ; i < NStats ; i++) { stats [i] = 0 ; } - stats [Status] = Ok 
; - stats [Info1] = -1 ; - stats [Info2] = -1 ; + stats [Colamd::Status] = Colamd::Ok ; + stats [Colamd::Info1] = -1 ; + stats [Colamd::Info2] = -1 ; if (!A) /* A is not present */ { - stats [Status] = ErrorANotPresent ; + stats [Colamd::Status] = Colamd::ErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } if (!p) /* p is not present */ { - stats [Status] = ErrorPNotPresent ; + stats [Colamd::Status] = Colamd::ErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } if (n_row < 0) /* n_row must be >= 0 */ { - stats [Status] = ErrorNrowNegative ; - stats [Info1] = n_row ; + stats [Colamd::Status] = Colamd::ErrorNrowNegative ; + stats [Colamd::Info1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } if (n_col < 0) /* n_col must be >= 0 */ { - stats [Status] = ErrorNcolNegative ; - stats [Info1] = n_col ; + stats [Colamd::Status] = Colamd::ErrorNcolNegative ; + stats [Colamd::Info1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } @@ -434,16 +405,16 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [Status] = ErrorNnzNegative ; - stats [Info1] = nnz ; + stats [Colamd::Status] = Colamd::ErrorNnzNegative ; + stats [Colamd::Info1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } if (p [0] != 0) { - stats [Status] = ErrorP0Nonzero ; - stats [Info1] = p [0] ; + stats [Colamd::Status] = Colamd::ErrorP0Nonzero ; + stats [Colamd::Info1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } @@ -452,7 +423,7 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * if (!knobs) { - colamd_set_defaults (default_knobs) ; + set_defaults (default_knobs) ; knobs = default_knobs ; } @@ -465,20 +436,20 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [Status] = ErrorATooSmall ; - stats [Info1] = need ; - stats [Info2] = Alen ; + stats [Colamd::Status] = Colamd::ErrorATooSmall ; + stats [Colamd::Info1] = need ; + stats [Colamd::Info2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } Alen -= Col_size + Row_size ; - Col = (colamd_col *) &A [Alen] ; - Row = (Colamd_Row *) &A [Alen + Col_size] ; + Col = (ColStructure *) &A [Alen] ; + Row = (RowStructure *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) + if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; @@ -487,23 +458,23 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* === Initialize scores, kill dense rows/columns ======================= */ - Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, + Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; /* === Order the supercolumns =========================================== */ - ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, + ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; /* === Order the non-principal columns 
================================== */ - Eigen::internal::order_children (n_col, Col, p) ; + Colamd::order_children (n_col, Col, p) ; /* === Return statistics in stats ======================================= */ - stats [DenseRow] = n_row - n_row2 ; - stats [DenseCol] = n_col - n_col2 ; - stats [DefragCount] = ngarbage ; + stats [Colamd::DenseRow] = n_row - n_row2 ; + stats [Colamd::DenseCol] = n_col - n_col2 ; + stats [Colamd::DefragCount] = ngarbage ; COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -514,7 +485,6 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* There are no user-callable routines beyond this point in the file */ - /* ========================================================================== */ /* === init_rows_cols ======================================================= */ /* ========================================================================== */ @@ -534,11 +504,11 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [ColamdStats] /* colamd statistics */ + IndexType stats [NStats] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -561,9 +531,9 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [Status] = ErrorColLengthNegative ; - stats [Info1] = col ; - stats [Info2] = Col [col].length ; + stats [Colamd::Status] = Colamd::ErrorColLengthNegative ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } @@ -600,10 +570,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [Status] = ErrorRowIndexOutOfBounds ; - stats [Info1] = col ; - stats [Info2] = row ; - stats [Info3] = n_row ; + stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + stats [Colamd::Info3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -612,10 +582,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. 
*/ - stats [Status] = OkButJumbled ; - stats [Info1] = col ; - stats [Info2] = row ; - (stats [Info3]) ++ ; + stats [Colamd::Status] = Colamd::OkButJumbled ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + (stats [Colamd::Info3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -750,11 +720,11 @@ static void init_scoring IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [ColamdKnobs],/* parameters */ + double knobs [NKnobs],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -781,8 +751,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseRow] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseCol] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -799,7 +769,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - kill_principal_col(Col, c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -810,7 +780,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (col_is_dead(Col, c)) + if (Col[c].is_dead()) { continue ; } @@ -826,7 +796,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - kill_principal_col(Col, c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; @@ -840,7 +810,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - kill_row(Row, r) ; + Row[r].kill() ; --n_row2 ; } else @@ -862,7 +832,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (col_is_dead(Col, c)) + if (Col[c].is_dead()) { continue ; } @@ -875,7 +845,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (row_is_dead(Row, row)) + if (Row[row].is_dead()) { continue ; } @@ -894,7 +864,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - kill_principal_col(Col, c) ; + Col[c].kill_principal() ; } else { @@ -927,7 +897,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (col_is_alive(Col, c)) + if (Col[c].is_alive()) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -988,8 +958,8 @@ static IndexType find_ordering /* return the number of garbage collections */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns 
of A */ IndexType Alen, /* size of A, 2*nnz + n_col or larger */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ IndexType n_col2, /* Remaining columns to order */ @@ -1035,7 +1005,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Initialization and clear mark ==================================== */ max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; min_score = 0 ; ngarbage = 0 ; COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; @@ -1066,7 +1036,7 @@ static IndexType find_ordering /* return the number of garbage collections */ Col [next_col].shared3.prev = Empty ; } - COLAMD_ASSERT (col_is_alive(Col, pivot_col)) ; + COLAMD_ASSERT (Col[pivot_col].is_alive()) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1085,12 +1055,12 @@ static IndexType find_ordering /* return the number of garbage collections */ needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { - pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; ngarbage++ ; /* after garbage collection we will have enough */ COLAMD_ASSERT (pfree + needed_memory < Alen) ; /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } @@ -1113,9 +1083,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", row_is_alive(Row, row), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ; /* skip if row is dead */ - if (row_is_dead(Row, row)) + if (Row[row].is_dead()) { continue ; } @@ -1127,7 +1097,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && col_is_alive(Col, col)) + if (col_thickness > 0 && Col[col].is_alive()) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1154,7 +1124,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - kill_row(Row, row) ; + Row[row].kill() ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1206,7 +1176,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1243,12 +1213,12 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (row_is_marked_dead (row_mark)) + if (Row[row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row != 
pivot_row) ; set_difference = row_mark - tag_mark ; /* check if the row has been seen yet */ @@ -1264,7 +1234,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; - kill_row(Row, row) ; + Row[row].kill() ; } else { @@ -1286,7 +1256,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1301,12 +1271,12 @@ static IndexType find_ordering /* return the number of garbage collections */ /* get a row */ row = *cp++ ; COLAMD_ASSERT(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (row_is_marked_dead (row_mark)) + if (Row [row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row_mark > tag_mark) ; /* compact the column */ *new_cp++ = row ; @@ -1327,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - kill_principal_col(Col, col) ; + Col[col].kill_principal() ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1368,7 +1338,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (col_is_alive(Col, col)) ; + COLAMD_ASSERT (Col[col].is_alive()) ; } } @@ -1378,11 +1348,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ; - Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; + Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; /* === Kill the pivotal column ====================================== */ - kill_principal_col(Col, pivot_col) ; + Col[pivot_col].kill_principal() ; /* === Clear mark =================================================== */ @@ -1390,7 +1360,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (tag_mark >= max_mark) { COLAMD_DEBUG2 (("clearing tag_mark\n")) ; - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } /* === Finalize the new pivot row, and column scores ================ */ @@ -1406,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (col_is_dead(Col, col)) + if (Col[col].is_dead()) { continue ; } @@ -1497,7 +1467,7 @@ static inline void order_children /* === Parameters ======================================================= */ IndexType n_col, /* number of columns of A */ - colamd_col Col [], /* of size n_col+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType p [] /* p [0 ... 
n_col-1] is the column permutation*/ ) { @@ -1514,14 +1484,14 @@ static inline void order_children { /* find an un-ordered non-principal column */ COLAMD_ASSERT (col_is_dead(Col, i)) ; - if (!col_is_dead_principal(Col, i) && Col [i].shared2.order == Empty) + if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!col_is_dead_principal(Col, parent)) ; + } while (!Col[parent].is_dead_principal()) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. collapse tree at the same time */ @@ -1597,7 +1567,7 @@ static void detect_super_cols ( /* === Parameters ======================================================= */ - colamd_col Col [], /* of size n_col+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ IndexType row_start, /* pointer to set of columns to check */ @@ -1627,7 +1597,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (col_is_dead(Col, col)) + if (Col[col].is_dead()) { continue ; } @@ -1653,7 +1623,7 @@ static void detect_super_cols for (super_c = first_col ; super_c != Empty ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (col_is_alive(Col, super_c)) ; + COLAMD_ASSERT (Col [super_c].is_alive()) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1666,7 +1636,7 @@ static void detect_super_cols c != Empty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (col_is_alive(Col, c)) ; + COLAMD_ASSERT (Col[c].is_alive()) ; COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1684,8 +1654,8 @@ static void detect_super_cols for (i = 0 ; i < length ; i++) { /* the columns are "clean" (no dead rows) */ - COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; - COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; + COLAMD_ASSERT ( cp1->is_alive() ); + COLAMD_ASSERT ( cp2->is_alive() ); /* row indices will same order for both supercols, */ /* no gather scatter necessary */ if (*cp1++ != *cp2++) @@ -1707,7 +1677,7 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - kill_non_principal_col(Col, c) ; + Col[c].kill_non_principal() ; /* order c later, in order_children() */ Col [c].shared2.order = Empty ; /* remove c from hash bucket */ @@ -1750,8 +1720,8 @@ static IndexType garbage_collection /* returns the new value of pfree */ IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ - Colamd_Row Row [], /* row info */ - colamd_col Col [], /* column info */ + RowStructure Row [], /* row info */ + ColStructure Col [], /* column info */ IndexType A [], /* A [0 ... Alen-1] holds the matrix */ IndexType *pfree /* &A [0] ... 
pfree is in use */ ) @@ -1770,7 +1740,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (col_is_alive(Col, c)) + if (Col[c].is_alive()) { psrc = &A [Col [c].start] ; @@ -1781,7 +1751,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (row_is_alive(Row, r)) + if (Row[r].is_alive()) { *pdest++ = r ; } @@ -1794,22 +1764,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (row_is_alive(Row, r)) + if (Row[r].is_alive()) { if (Row [r].length == 0) { - /* this row is of zero length. cannot compact it, so kill it */ - COLAMD_DEBUG3 (("Defrag row kill\n")) ; - kill_row(Row, r) ; + /* this row is of zero length. cannot compact it, so kill it */ + COLAMD_DEBUG3 (("Defrag row kill\n")) ; + Row[r].kill() ; } else { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (row_is_alive(Row, r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ones_complement(r) ; + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + COLAMD_ASSERT (Row[r].is_alive()) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ones_complement(r) ; } } @@ -1829,7 +1799,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (row_is_alive(Row, r)) ; + COLAMD_ASSERT (Row[r].is_alive()) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1838,7 +1808,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (col_is_alive(Col, c)) + if (Col[c].is_alive()) { *pdest++ = c ; } @@ -1870,7 +1840,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ /* === Parameters ======================================================= */ IndexType n_row, /* number of rows in A */ - Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ + RowStructure Row [] /* Row [0 ... 
n_row-1].shared2.mark is set to zero */ ) { /* === Local variables ================================================== */ @@ -1879,7 +1849,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (row_is_alive(Row, r)) + if (Row[r].is_alive()) { Row [r].shared2.mark = 0 ; } @@ -1887,6 +1857,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ return (1) ; } +} // namespace Colamd } // namespace internal #endif diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 10ba6b464..c57897014 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -129,17 +129,17 @@ class COLAMDOrdering StorageIndex n = StorageIndex(mat.cols()); StorageIndex nnz = StorageIndex(mat.nonZeros()); // Get the recommended value of Alen to be used by colamd - StorageIndex Alen = internal::colamd_recommended(nnz, m, n); + StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); // Set the default parameters - double knobs [ColamdKnobs]; - StorageIndex stats [ColamdStats]; - internal::colamd_set_defaults(knobs); + double knobs [internal::Colamd::NKnobs]; + StorageIndex stats [internal::Colamd::NStats]; + internal::Colamd::set_defaults(knobs); IndexVector p(n+1), A(Alen); for(StorageIndex i=0; i <= n; i++) p(i) = mat.outerIndexPtr()[i]; for(StorageIndex i=0; i < nnz; i++) A(i) = mat.innerIndexPtr()[i]; // Call Colamd routine to compute the ordering - StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); + StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); EIGEN_UNUSED_VARIABLE(info); eigen_assert( info && "COLAMD failed " ); From c694be1214d99c3cc0431c719c110d10cf64a7ec Mon Sep 17 00:00:00 2001 From: Alberto Luaces Date: Tue, 23 Jul 2019 09:24:06 +0000 Subject: [PATCH 27/30] Fixed Tensor documentation formatting. --- unsupported/Eigen/CXX11/src/Tensor/README.md | 158 ++++++++++--------- 1 file changed, 80 insertions(+), 78 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 006f35b23..1f4cf272b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1630,81 +1630,81 @@ dimension in RowMajor layout. 
For example, given the following input tensor: - Eigen::Tensor tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); - cout << "tensor: " << endl << tensor << endl; -=> -tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 Six 2x2 patches can be extracted and indexed using the following code: - Eigen::Tensor patch; - Eigen::array patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; } - cout << endl; } - } This code results in the following output when the data layout is ColMajor: -patch index: 0 -0 1 -4 5 -patch index: 1 -4 5 -8 9 -patch index: 2 -1 2 -5 6 -patch index: 3 -5 6 -9 10 -patch index: 4 -2 3 -6 7 -patch index: 5 -6 7 -10 11 + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 This code results in the following output when the data layout is RowMajor: (NOTE: the set of patches is the same as in ColMajor, but are indexed differently). 
-patch index: 0 -0 1 -4 5 -patch index: 1 -1 2 -5 6 -patch index: 2 -2 3 -6 7 -patch index: 3 -4 5 -8 9 -patch index: 4 -5 6 -9 10 -patch index: 5 -6 7 -10 11 + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 ### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` @@ -1736,28 +1736,30 @@ sizes: *) columns: 5 *) batch: 7 - Tensor tensor(2,3,5,7); - Tensor tensor_row_major = tensor.swap_layout(); + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); 2x2 image patches can be extracted and indexed using the following code: *) 2D patch: ColMajor (patch indexed by second-to-last dimension) - Tensor twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 *) 2D patch: RowMajor (patch indexed by the second dimension) - Tensor twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - // twod_patch_row_major.dimension(4) == 2 + + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 ## Special Operations From 8e7e3d9bc85152654ab27fbbaecb3ea1397c3ae7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Sep 2019 13:09:03 +0200 Subject: [PATCH 28/30] Makes Scalar/RealScalar typedefs public in Pardiso's wrappers (see PR 688) --- Eigen/src/PardisoSupport/PardisoSupport.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 07006b5c4..f89b79bd5 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -386,14 +386,15 @@ class PardisoLU : public PardisoImpl< PardisoLU > { protected: typedef PardisoImpl Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLU >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; + using Base::compute; using Base::solve; @@ -441,14 +442,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > { protected: typedef PardisoImpl< PardisoLLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; enum { UpLo = _UpLo }; using 
Base::compute; @@ -504,14 +505,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > { protected: typedef PardisoImpl< PardisoLDLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLDLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; using Base::compute; enum { UpLo = Options&(Upper|Lower) }; From f68f2bba09b556d98e314600676304193e60cfcb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 3 Sep 2019 11:08:09 -0700 Subject: [PATCH 29/30] TensorMap constness should not change underlying storage constness --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index b28cd822f..172a6bab8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -136,10 +136,10 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerConstType data() const { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -152,14 +152,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const + EIGEN_STRONG_INLINE StorageRefType operator()() const { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -167,7 +167,7 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); @@ -181,7 +181,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -192,7 +192,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -203,7 +203,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -214,7 +214,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); From a8d264fa9c56e42f77e2129d4e504f5c854821c2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 3 Sep 2019 11:38:39 -0700 Subject: [PATCH 30/30] Add test for const TensorMap underlying data mutation --- unsupported/test/cxx11_tensor_map.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index dc8532f5c..4d4f68911 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -288,6 +288,30 @@ static void test_0d_const_tensor() VERIFY_IS_EQUAL(scalar4(), 13); } +static void test_0d_const_tensor_map() +{ + Tensor scalar1; + Tensor scalar2; + + const TensorMap > scalar3(scalar1.data()); + const TensorMap > scalar4(scalar2.data()); + + // Although TensorMap is constant, we still can write to the underlying + // storage, because we map over non-constant Tensor. + scalar3() = 7; + scalar4() = 13; + + VERIFY_IS_EQUAL(scalar1(), 7); + VERIFY_IS_EQUAL(scalar2(), 13); + + // Pointer to the underlying storage is also non-const. + scalar3.data()[0] = 8; + scalar4.data()[0] = 14; + + VERIFY_IS_EQUAL(scalar1(), 8); + VERIFY_IS_EQUAL(scalar2(), 14); +} + EIGEN_DECLARE_TEST(cxx11_tensor_map) { CALL_SUBTEST(test_0d()); @@ -299,4 +323,5 @@ EIGEN_DECLARE_TEST(cxx11_tensor_map) CALL_SUBTEST(test_casting()); CALL_SUBTEST(test_0d_const_tensor()); + CALL_SUBTEST(test_0d_const_tensor_map()); }
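
A minimal standalone sketch of the semantics pinned down by the last two patches (an illustration only, not part of the patch series; it assumes the patched TensorMap accessors above and, for the read-only remark, that constness can instead be placed on the mapped type): a const TensorMap built over a non-const Tensor behaves like a "T* const" pointer, so the map object itself cannot be re-seated, but the mapped elements remain writable through operator() and data().

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<int, 2> storage(2, 2);
      storage.setZero();

      // The map is const; the mapped data is not.
      const Eigen::TensorMap<Eigen::Tensor<int, 2> > view(storage.data(), 2, 2);

      view(0, 0) = 7;      // operator() const returns a writable reference after the patch
      view.data()[3] = 9;  // data() const returns a non-const pointer after the patch

      assert(storage(0, 0) == 7);
      assert(storage(1, 1) == 9);  // linear index 3 is element (1,1) of a 2x2 tensor

      // To obtain a genuinely read-only view, put the constness on the mapped
      // type itself, e.g. Eigen::TensorMap<const Eigen::Tensor<int, 2> >
      // (assumption: the const-element form is what read-only maps use).
      return 0;
    }

In other words, constness of the view object is orthogonal to constness of the viewed scalars, which is exactly what test_0d_const_tensor_map() checks for the 0-dimensional case.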