mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-13 00:21:49 +08:00
AVX512: fix psqrt and prsqrt
(grafted from 7b0630315f343422b37f62f40a039c9e725fe9e1 )
This commit is contained in:
parent
1641a6cdd5
commit
c2f9e6cb37
@ -266,8 +266,7 @@ psqrt<Packet16f>(const Packet16f& _x) {
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
|
||||
_mm512_setzero_ps());
|
||||
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
@ -289,8 +288,7 @@ psqrt<Packet8d>(const Packet8d& _x) {
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
|
||||
_mm512_setzero_pd());
|
||||
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
@ -333,20 +331,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
|
||||
_mm512_rsqrt14_ps(_x));
|
||||
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
|
||||
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
|
||||
Packet16f infs_and_nans = _mm512_mask_blend_ps(
|
||||
neg_mask, p16f_nan,
|
||||
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
|
||||
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
|
||||
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -363,14 +359,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
|
||||
_mm512_rsqrt14_pd(_x));
|
||||
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
|
||||
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
|
||||
Packet8d infs_and_nans = _mm512_mask_blend_pd(
|
||||
neg_mask, p8d_nan,
|
||||
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
|
||||
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
@ -379,7 +373,7 @@ prsqrt<Packet8d>(const Packet8d& _x) {
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
|
||||
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
|
Loading…
x
Reference in New Issue
Block a user