// Review notes. This is free-form text placed ahead of the first "diff --git"
// header (patch tools normally skip such leading text); the patch content
// below is unchanged.
//
// What the patch does, as far as the hunks show:
//
// 1. AVX/MathFunctions.h
//    - Removes the single EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4d)
//      line and replaces it with one EIGEN_DOUBLE_PACKET_FUNCTION line each
//      for atan, log, log2 and exp.
//    - sin and cos for Packet4d are only emitted when EIGEN_VECTORIZE_AVX2 is
//      defined.
//    - NOTE(review): the removed macro's expansion is not visible here.
//      Confirm it did not instantiate other Packet4d functions that callers
//      still rely on.
//
// 2. AVX/PacketMath.h, hunk at line 142
//    - Wraps the HasSin / HasCos enumerators of a packet_traits specialization
//      in #ifdef EIGEN_VECTORIZE_AVX2, matching the sin/cos guard above.
//    - Without AVX2 this struct no longer sets them itself; it derives from
//      default_packet_traits per the hunk header.
//    - NOTE(review): presumably this is the double / Packet4d specialization;
//      the template argument is not visible in the hunk header. Verify.
//
// 3. AVX/PacketMath.h, hunk at line 2135: pblend for Packet8f and Packet4d
//    - With EIGEN_VECTORIZE_AVX2: the selector entries are loaded into an
//      integer vector and the mask is computed as (0 - select) with
//      _mm256_sub_epi32 / _mm256_sub_epi64, then bit-cast to a float/double
//      vector.
//    - Otherwise: the selector entries are loaded into a float/double vector
//      with _mm256_set_ps / _mm256_set_pd, and the mask is produced by
//      comparing against zero with _mm256_cmp_ps / _mm256_cmp_pd using
//      _CMP_NEQ_UQ.
//    - Both paths end in one shared pselect(true_mask, thenPacket, elsePacket).
//    - Rationale (Intel Intrinsics Guide): the 256-bit integer subtract
//      intrinsics are AVX2, whereas set_ps/pd and cmp_ps/pd are plain AVX.
//
// Open review points (not applied here):
//    - In the non-AVX2 Packet4d branch, true_mask is declared without `const`,
//      unlike the other three true_mask declarations in this patch.
//    - The new EIGEN_DOUBLE_PACKET_FUNCTION(...) lines end in ';' while the
//      neighbouring EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet8f) line
//      does not. If the macro expands to a complete definition, the ';' is a
//      stray empty declaration. Confirm against the macro definition.
//    - NOTE(review): this copy of the patch appears to have lost its line
//      breaks and possibly some angle-bracket template arguments (e.g.
//      "packet_traits", "PacketBlock&", "pselect("). Regenerate it from
//      version control before applying.
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index b125d5974..cc8229708 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -22,7 +22,15 @@ namespace Eigen { namespace internal { EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet8f) -EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4d) + +EIGEN_DOUBLE_PACKET_FUNCTION(atan, Packet4d); +EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet4d); +EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d); +EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d); +#ifdef EIGEN_VECTORIZE_AVX2 +EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d); +EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d); +#endif // Notice that for newer processors, it is counterproductive to use Newton // iteration for square root. In particular, Skylake and Zen2 processors diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 5dd5ce2e9..6df332470 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -142,8 +142,10 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, +#ifdef EIGEN_VECTORIZE_AVX2 HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, +#endif HasLog = 1, HasExp = 1, HasSqrt = 1, @@ -2135,20 +2137,33 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template <> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { +#ifdef EIGEN_VECTORIZE_AVX2 const __m256i select = _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - const __m256i true_mask = _mm256_sub_epi32(_mm256_setzero_si256(), select); - return pselect(_mm256_castsi256_ps(true_mask), thenPacket, elsePacket); + const __m256 true_mask = _mm256_castsi256_ps(_mm256_sub_epi32(_mm256_setzero_si256(), select)); +#else + const 
__m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], + ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + const __m256 true_mask = _mm256_cmp_ps(select, _mm256_setzero_ps(), _CMP_NEQ_UQ); +#endif + + return pselect(true_mask, thenPacket, elsePacket); } template <> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { +#ifdef EIGEN_VECTORIZE_AVX2 const __m256i select = _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - const __m256i true_mask = _mm256_sub_epi64(_mm256_setzero_si256(), select); - return pselect(_mm256_castsi256_pd(true_mask), thenPacket, elsePacket); + const __m256d true_mask = _mm256_castsi256_pd(_mm256_sub_epi64(_mm256_setzero_si256(), select)); +#else + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d true_mask = _mm256_cmp_pd(select, _mm256_setzero_pd(), _CMP_NEQ_UQ); +#endif + + return pselect(true_mask, thenPacket, elsePacket); } // Packet math for Eigen::half