@@ -310,18 +310,16 @@ volk_32f_invsqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_
310310 for (size_t vl; n > 0 ; n -= vl, aVector += vl, cVector += vl) {
311311 vl = __riscv_vsetvl_e32m8 (n);
312312 vfloat32m8_t a = __riscv_vle32_v_f32m8 (aVector, vl);
313+ vfloat32m8_t half = __riscv_vfmv_v_f_f32m8 (0 .5f , vl);
314+ vfloat32m8_t three_halfs = __riscv_vfmv_v_f_f32m8 (1 .5f , vl);
313315 // Initial estimate (~7-bit precision)
314316 vfloat32m8_t x = __riscv_vfrsqrt7 (a, vl);
315317 // Two Newton-Raphson iterations: x = x * (1.5 - 0.5 * a * x * x)
316- vfloat32m8_t ax = __riscv_vfmul (a, x, vl);
317- vfloat32m8_t half_ax = __riscv_vfmul_vf (ax, 0 .5f , vl);
318+ vfloat32m8_t half_a = __riscv_vfmul (half, a, vl);
318319 x = __riscv_vfmul (
319- x, __riscv_vfnmsac_vf (__riscv_vfmv_v_f_f32m8 (1 .5f , vl), half_ax, x, vl), vl);
320- // Second iteration
321- ax = __riscv_vfmul (a, x, vl);
322- half_ax = __riscv_vfmul_vf (ax, 0 .5f , vl);
320+ x, __riscv_vfnmsac (three_halfs, half_a, __riscv_vfmul (x, x, vl), vl), vl);
323321 x = __riscv_vfmul (
324- x, __riscv_vfnmsac_vf ( __riscv_vfmv_v_f_f32m8 ( 1 . 5f , vl), half_ax , x, vl), vl);
322+ x, __riscv_vfnmsac (three_halfs, half_a, __riscv_vfmul (x , x, vl) , vl), vl);
325323 __riscv_vse32 (cVector, x, vl);
326324 }
327325}
0 commit comments