Avoid underflow in prsqrt.

This commit is contained in:
Rasmus Munk Larsen 2023-06-06 14:06:19 -07:00
parent b7151ffaab
commit 7d7576f326

View File

@ -87,11 +87,11 @@ struct generic_rsqrt_newton_step {
Packet inv_sqrt = approx_rsqrt;
for (int step = 0; step < Steps; ++step) {
// Refine the approximation using one Newton-Raphson step:
// h_n = x * (inv_sqrt * inv_sqrt) - 1 (so that h_n is nearly 0).
// h_n = (x * inv_sqrt) * inv_sqrt - 1 (so that h_n is nearly 0).
// inv_sqrt = inv_sqrt - 0.5 * inv_sqrt * h_n
Packet r2 = pmul(inv_sqrt, inv_sqrt);
Packet r2 = pmul(a, inv_sqrt);
Packet half_r = pmul(inv_sqrt, cst_minus_half);
Packet h_n = pmadd(a, r2, cst_minus_one);
Packet h_n = pmadd(r2, inv_sqrt, cst_minus_one);
inv_sqrt = pmadd(half_r, h_n, inv_sqrt);
}