This is the performance critical code. It would be cool if the inner-most loop here could be parallelized:
for _ in 0..log_2_of_n {
        let w_m = omega.mod_pow_u32(n / (2 * m));
        let mut k = 0;
        while k < n {
            let mut w = BFieldElement::one();
            for j in 0..m {
                let u = x[(k + j) as usize];
                let mut v = x[(k + j + m) as usize];
                v *= w;
                x[(k + j) as usize] = u + v;
                x[(k + j + m) as usize] = u - v;
                w *= w_m;
            }
            k += 2 * m;
        }
        m *= 2;
    }