diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 320479bb9..888f51d04 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -16,29 +16,7 @@ namespace Eigen { namespace internal { -// For now we use SSE to handle integers, so we can't use AVX instructions to cast -// from int to float -template <> -struct type_casting_traits { - enum { - VectorizedCast = 0, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 0, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - - #ifndef EIGEN_VECTORIZE_AVX512 - template <> struct type_casting_traits { enum { @@ -76,8 +54,17 @@ struct type_casting_traits { }; }; +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; #endif // EIGEN_VECTORIZE_AVX512 + template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { return _mm256_cvttps_epi32(a); } @@ -86,6 +73,43 @@ template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i return _mm256_cvtepi32_ps(a); } +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet4d& a, const Packet4d& b) { + return _mm256_set_m128(_mm256_cvtpd_ps(a), _mm256_cvtpd_ps(b)); +} + +template <> +EIGEN_STRONG_INLINE Packet16b pcast(const Packet8f& a, + const Packet8f& b) { + __m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ); + __m256 nonzero_b = _mm256_cmp_ps(b, pzero(b), _CMP_NEQ_UQ); + constexpr char kFF = '\255'; +#ifndef EIGEN_VECTORIZE_AVX2 + __m128i shuffle_mask128_a_lo = _mm_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0); + __m128i shuffle_mask128_a_hi = _mm_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF); + __m128i shuffle_mask128_b_lo = _mm_set_epi8(kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF); + __m128i shuffle_mask128_b_hi = _mm_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF); + __m128i a_hi = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_a), 1), shuffle_mask128_a_hi); + __m128i a_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_a), 0), shuffle_mask128_a_lo); + __m128i b_hi = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 1), shuffle_mask128_b_hi); + __m128i b_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 0), shuffle_mask128_b_lo); + + #else + + __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, + kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0); + __m256i b_shuffle_mask = _mm256_set_epi8( 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, + kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF); + __m256i a_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_a), a_shuffle_mask); + __m256i b_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_b), b_shuffle_mask); + __m128i a_hi = _mm256_extractf128_si256(a_shuff, 1); + __m128i a_lo = _mm256_extractf128_si256(a_shuff, 0); + __m128i b_hi = _mm256_extractf128_si256(b_shuff, 1); + __m128i b_lo = _mm256_extractf128_si256(b_shuff, 0); +#endif + __m128i merged = _mm_or_si128(_mm_or_si128(b_lo, b_hi), _mm_or_si128(a_lo, a_hi)); + return _mm_and_si128(merged, _mm_set1_epi8(1)); +} + template<> EIGEN_STRONG_INLINE Packet8i preinterpret(const Packet8f& a) { return _mm256_castps_si256(a); } diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h index 755cef58c..60f49a3d9 100644 --- a/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -26,7 +26,7 @@ struct type_casting_traits { }; template <> -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, @@ -40,7 +40,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcast(const Packe } template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16b& a) { - return _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(a)); + return _mm512_cvtepi32_ps(_mm512_and_si512(_mm512_cvtepi8_epi32(a), _mm512_set1_epi32(1))); } template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { @@ -51,6 +51,10 @@ template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packe return _mm512_cvtepi32_ps(a); } +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet8d& a, const Packet8d& b) { + return cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b)); +} + template<> EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { return _mm512_castps_si512(a); } diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index a6346ea0e..2ab09434f 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -17,6 +17,15 @@ namespace Eigen { namespace internal { #ifndef EIGEN_VECTORIZE_AVX +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 4, + TgtCoeffRatio = 1 + }; +}; + template <> struct type_casting_traits { enum { @@ -35,6 +44,16 @@ struct type_casting_traits { }; }; +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; +#endif + template <> struct type_casting_traits { enum { @@ -45,14 +64,20 @@ struct type_casting_traits { }; template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; -#endif +EIGEN_STRONG_INLINE Packet16b pcast(const Packet4f& a, + const Packet4f& b, + const Packet4f& c, + const Packet4f& d) { + __m128 zero = pzero(a); + __m128 nonzero_a = _mm_cmpneq_ps(a, zero); + __m128 nonzero_b = _mm_cmpneq_ps(b, zero); + __m128 nonzero_c = _mm_cmpneq_ps(c, zero); + __m128 nonzero_d = _mm_cmpneq_ps(d, zero); + __m128i ab_bytes = _mm_packs_epi32(_mm_castps_si128(nonzero_a), _mm_castps_si128(nonzero_b)); + __m128i cd_bytes = _mm_packs_epi32(_mm_castps_si128(nonzero_c), _mm_castps_si128(nonzero_d)); + __m128i merged = _mm_packs_epi16(ab_bytes, cd_bytes); + return _mm_and_si128(merged, _mm_set1_epi8(1)); +} template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { return _mm_cvttps_epi32(a);