diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bcceaa7b..95c4e4027 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -216,11 +216,11 @@ pandnot(const Packet& a, const Packet& b) { return a & (~b); } /** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} +ptrue(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { return pxor(pones(a), a);} +pnot(const Packet& a) { return pxor(ptrue(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -258,15 +258,15 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 9f1bb969e..dcca35279 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -75,7 +75,7 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } -template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } @@ -290,7 +290,7 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } -template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index f6a514fbf..c18c18cc3 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -251,22 +251,34 @@ template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { re template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } +template<> EIGEN_STRONG_INLINE Packet8i ptrue(const Packet8i& a) { #ifdef EIGEN_VECTORIZE_AVX2 -template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { - return _mm256_cmpeq_epi64(a,a); -} + // vpcmpeqd has lower latency than the more general vcmpps + return _mm256_cmpeq_epi32(a,a); #else -template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { - const unsigned int o = 0xffffffffu; - return _mm256_set_epi32(o, o, o, o, o, o, o, o); -} + const __m256 b = _mm256_castsi256_ps(a); + return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ)); #endif -template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { - return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); } -template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { - return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +template<> EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + const __m256i b = _mm256_castps_si256(a); + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b)); +#else + return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4d ptrue(const Packet4d& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqq has lower latency than the more general vcmppd + const __m256i b = _mm256_castpd_si256(a); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b)); +#else + return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ); +#endif } template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 154fedc25..b7e68d2ab 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -55,6 +55,8 @@ template<> struct unpacket_traits { typedef Packet4cf half; }; +template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(a.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pnot(const Packet8cf& a) { return Packet8cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) @@ -268,6 +270,8 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, con return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); } +template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pnot(const Packet4cd& a) { return Packet4cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 9666c4e22..1164f24b1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -321,19 +321,18 @@ EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { } template <> -EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { - const unsigned int o = 0xffffffffu; - return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +EIGEN_STRONG_INLINE Packet16i ptrue(const Packet16i& /*a*/) { + return _mm512_set1_epi32(0xffffffffu); } template <> -EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { - return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) { + return _mm512_castsi512_ps(ptrue(_mm512_castps_si512(a))); } template <> -EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { - return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); +EIGEN_STRONG_INLINE Packet8d ptrue(const Packet8d& a) { + return _mm512_castsi512_pd(ptrue(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index c4dfedcf8..00e40d40b 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,7 +143,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; } @@ -648,8 +648,8 @@ template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; } -template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { - Packet16h r; r.x = Packet8i(pones(a.x)); return r; +template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { + Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { @@ -1097,7 +1097,7 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } -template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index fa84097ac..c3b1de5ce 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,7 +82,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } @@ -308,7 +308,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6dd2f8a46..ebc540e24 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,14 +378,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } template<> EIGEN_STRONG_INLINE Packet4f -pones(const Packet4f& a) { +ptrue(const Packet4f& a) { Packet4i b = _mm_castps_si128(a); return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d -pones(const Packet2d& a) { +ptrue(const Packet2d& a) { Packet4i b = _mm_castpd_si128(a); return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 460cfbdbe..04f93108f 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -240,7 +240,7 @@ template void packetmath() CHECK_CWISE1(internal::pnot, internal::pnot); CHECK_CWISE1(internal::pzero, internal::pzero); - CHECK_CWISE1(internal::pones, internal::pones); + CHECK_CWISE1(internal::ptrue, internal::ptrue); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj);