AVX2 - double->int64_t casting

This commit is contained in:
Charles Schlosser 2024-03-29 21:35:09 +00:00
parent 13092b5d04
commit f75e2297db
2 changed files with 41 additions and 15 deletions

View File

@ -560,7 +560,7 @@ EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmeti
}
// Arithmetic right shift by 63: broadcast the sign bit across each 64-bit lane.
// AVX2 has no _mm256_srai_epi64, so emulate it with a signed compare:
// (0 > a) yields all-ones for negative lanes and all-zeros otherwise, which is
// exactly the result of a 63-bit arithmetic shift.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
@ -1802,14 +1802,12 @@ EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
// pabs should be ok
// Absolute value of 8 packed floats: clear the IEEE-754 sign bit of every lane
// by ANDing with 0x7FFFFFFF (set1 broadcasts the constant to all lanes).
template <>
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
  return _mm256_and_ps(a, mask);
}
// Absolute value of 4 packed doubles: clear the IEEE-754 sign bit of every
// 64-bit lane by ANDing with 0x7FFFFFFFFFFFFFFF.
template <>
EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
  return _mm256_and_pd(a, mask);
}
template <>
@ -1829,28 +1827,28 @@ EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
// Sign-bit mask for 8 packed halfs: all-ones in lanes whose sign bit is set,
// zero otherwise. A signed 16-bit compare (0 > a) is equivalent to an
// arithmetic shift right by 15 of the raw bits.
template <>
EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
}
// Sign-bit mask for 8 packed bfloat16s: all-ones in lanes whose sign bit is
// set, zero otherwise, via a signed 16-bit compare against zero.
template <>
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
}
// Sign-bit mask for 8 packed floats: reinterpret as int32 lanes and compare
// (0 > bits); negative bit patterns (sign bit set) yield all-ones lanes.
template <>
EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
}
// Unsigned integers have no sign bit, so the mask is identically zero.
template <>
EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) {
  return _mm256_setzero_si256();
}
#ifdef EIGEN_VECTORIZE_AVX2
// Sign-bit mask for 4 packed doubles: reinterpret as int64 lanes and compare
// (0 > bits); lanes with the sign bit set become all-ones (AVX2 cmpgt_epi64).
template <>
EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
  return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
}
// Unsigned 64-bit integers have no sign bit, so the mask is identically zero.
template <>
EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
  return _mm256_setzero_si256();
}
#endif

View File

@ -200,10 +200,38 @@ EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
return _mm256_cvttpd_epi64(a);
#else
EIGEN_ALIGN16 double aux[4];
pstore(aux, a);
return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
static_cast<int64_t>(aux[0]));
// if 'a' exceeds the numerical limits of int64_t, the behavior is undefined
// e <= 0 corresponds to |a| < 1, which should result in zero. incidentally, intel intrinsics with shift arguments
// greater than or equal to 64 produce zero. furthermore, negative shifts appear to be interpreted as large positive
// shifts (two's complement), which also result in zero. therefore, e does not need to be clamped to [0, 64)
constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
const __m256i cst_one = _mm256_set1_epi64x(1);
const __m256i cst_total_bits = _mm256_set1_epi64x(kTotalBits);
const __m256i cst_bias = _mm256_set1_epi64x(kBias);
__m256i a_bits = _mm256_castpd_si256(a);
// shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover biased exponent
__m256i biased_e = _mm256_srli_epi64(_mm256_slli_epi64(a_bits, 1), kMantissaBits + 1);
__m256i e = _mm256_sub_epi64(biased_e, cst_bias);
// shift to the left by kExponentBits + 1 to clear the sign and exponent bits
__m256i shifted_mantissa = _mm256_slli_epi64(a_bits, kExponentBits + 1);
// shift to the right by kTotalBits - e to convert the significand to an integer
__m256i result_significand = _mm256_srlv_epi64(shifted_mantissa, _mm256_sub_epi64(cst_total_bits, e));
// add the implied bit
__m256i result_exponent = _mm256_sllv_epi64(cst_one, e);
// e <= 0 is interpreted as a large positive shift (2's complement), which also conveniently results in zero
__m256i result = _mm256_add_epi64(result_significand, result_exponent);
// handle negative arguments
__m256i sign_mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a_bits);
result = _mm256_sub_epi64(_mm256_xor_si256(result, sign_mask), sign_mask);
return result;
#endif
}