From c2f9e6cb37e5b14f80d3dbfecb090f46782bbb00 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 3 Apr 2018 14:12:50 +0200
Subject: [PATCH] AVX512: fix psqrt and prsqrt (grafted from
 7b0630315f343422b37f62f40a039c9e725fe9e1 )

---
 Eigen/src/Core/arch/AVX512/MathFunctions.h | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 399be0ee4..4695fbc81 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -266,8 +266,7 @@ psqrt<Packet16f>(const Packet16f& _x) {
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
-  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
-                                     _mm512_setzero_ps());
+  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
 
   // Do a single step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
@@ -289,8 +288,7 @@ psqrt<Packet8d>(const Packet8d& _x) {
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
-  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
-                                    _mm512_setzero_pd());
+  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
 
   // Do a first step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -333,20 +331,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
-  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
-                                     _mm512_rsqrt14_ps(_x));
+  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
 
   // Fill in NaNs and Infs for the negative/zero entries.
   __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
   Packet16f infs_and_nans = _mm512_mask_blend_ps(
-      neg_mask, p16f_nan,
-      _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+      neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
 
   // Do a single step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
 
   // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
 }
 
 template <>
@@ -363,14 +359,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
-  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
-                                    _mm512_rsqrt14_pd(_x));
+  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
 
   // Fill in NaNs and Infs for the negative/zero entries.
   __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
   Packet8d infs_and_nans = _mm512_mask_blend_pd(
-      neg_mask, p8d_nan,
-      _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+      neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
 
   // Do a first step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -379,7 +373,7 @@ prsqrt<Packet8d>(const Packet8d& _x) {
   x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
 
   // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
 }
 #else
 template <>