@@ -1132,18 +1132,25 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11321132 uint id = get_global_id (0 );
11331133 uint base_idx = id * 8 ;
11341134 ulong8 X = vload8 (0 , x + base_idx );
1135- ulong4 tmp , tmp2 , twiddles , twiddles2 , uu , r ;
1135+ ulong4 tmp , tmp2 , twiddles , uu , r ;
11361136 ulong a , b , d , e , s , r0 , r1 ;
11371137 uint k , j , i ;
11381138 uint k2 , j2 , ii ;
1139- uint twiddle_offset ;
1139+
1140+
1141+ ulong4 tmp_w12 = vload4 (0 , w + 12 );
1142+ ulong4 twiddles_w12 = (ulong4 )(1UL , tmp_w12 .s1 , tmp_w12 .s0 , tmp_w12 .s2 );
1143+ ulong4 tmp_w15 = vload4 (0 , w + 15 );
1144+ ulong4 twiddles_w15 = (ulong4 )(1UL , tmp_w15 .s1 , tmp_w15 .s0 , tmp_w15 .s2 );
1145+
1146+ ulong4 tmp_wi12 = vload4 (0 , wi + 12 );
1147+ ulong4 twiddles_wi12 = (ulong4 )(1UL , tmp_wi12 .s1 , tmp_wi12 .s0 , tmp_wi12 .s2 );
1148+ ulong4 tmp_wi15 = vload4 (0 , wi + 15 );
1149+ ulong4 twiddles_wi15 = (ulong4 )(1UL , tmp_wi15 .s1 , tmp_wi15 .s0 , tmp_wi15 .s2 );
11401150
11411151 k = id * 2 ;
11421152 j = k & (m - 1 );
11431153 i = 4 * (k - j ) + j ;
1144- twiddle_offset = 6 * m + 3 * j ;
1145- tmp = vload4 (0 , w + twiddle_offset );
1146- twiddles = (ulong4 )(1UL , tmp .s1 , tmp .s0 , tmp .s2 );
11471154 {
11481155 ulong4 c = (ulong4 )(X .s0 , X .s2 , X .s4 , X .s6 );
11491156 a = modAdd (c .s0 , c .s2 );
@@ -1154,7 +1161,7 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11541161 c .s1 = modSub (a , b );
11551162 c .s2 = modAdd (d , e );
11561163 c .s3 = modSub (d , e );
1157- c = modMul4 (c , twiddles );
1164+ c = modMul4 (c , twiddles_w12 );
11581165 X .s0 = c .s0 ;
11591166 X .s2 = c .s1 ;
11601167 X .s4 = c .s2 ;
@@ -1164,9 +1171,6 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11641171 k2 = id * 2 + 1 ;
11651172 j2 = k2 & (m - 1 );
11661173 ii = 4 * (k2 - j2 ) + j2 ;
1167- uint twiddle_offset2 = 6 * m + 3 * j2 ;
1168- tmp2 = vload4 (0 , w + twiddle_offset2 );
1169- twiddles2 = (ulong4 )(1UL , tmp2 .s1 , tmp2 .s0 , tmp2 .s2 );
11701174 {
11711175 ulong4 c2 = (ulong4 )(X .s1 , X .s3 , X .s5 , X .s7 );
11721176 a = modAdd (c2 .s0 , c2 .s2 );
@@ -1177,7 +1181,7 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11771181 c2 .s1 = modSub (a , b );
11781182 c2 .s2 = modAdd (d , e );
11791183 c2 .s3 = modSub (d , e );
1180- c2 = modMul4 (c2 , twiddles2 );
1184+ c2 = modMul4 (c2 , twiddles_w15 );
11811185 X .s1 = c2 .s0 ;
11821186 X .s3 = c2 .s1 ;
11831187 X .s5 = c2 .s2 ;
@@ -1231,42 +1235,38 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
12311235
12321236 k = id * 2 ;
12331237 j = k & (m - 1 );
1234- uint base = 4 * (k - j ) + j ;
1235- twiddle_offset = 6 * m + 3 * j ;
1236- a = X .s0 ;
1237- b = X .s2 ;
1238- d = X .s4 ;
1239- e = X .s6 ;
12401238 {
1241- ulong4 coeff = (ulong4 )(a , b , d , e );
1242- tmp = vload4 (0 , wi + twiddle_offset );
1243- twiddles = (ulong4 )(1UL , tmp .s1 , tmp .s0 , tmp .s2 );
1244- uu = modMul4 (coeff , twiddles );
1245- r = butterfly (uu );
1246- X .s0 = modAdd (r .s0 , r .s2 );
1247- X .s2 = modAdd (r .s1 , r .s3 );
1248- X .s4 = modSub (r .s0 , r .s2 );
1249- X .s6 = modSub (r .s1 , r .s3 );
1239+ a = X .s0 ;
1240+ b = X .s2 ;
1241+ d = X .s4 ;
1242+ e = X .s6 ;
1243+ {
1244+ ulong4 coeff = (ulong4 )(a , b , d , e );
1245+ coeff = modMul4 (coeff , twiddles_wi12 );
1246+ r = butterfly (coeff );
1247+ X .s0 = modAdd (r .s0 , r .s2 );
1248+ X .s2 = modAdd (r .s1 , r .s3 );
1249+ X .s4 = modSub (r .s0 , r .s2 );
1250+ X .s6 = modSub (r .s1 , r .s3 );
1251+ }
12501252 }
12511253
12521254 k = id * 2 + 1 ;
12531255 j = k & (m - 1 );
1254- base = 4 * (k - j ) + j ;
1255- twiddle_offset = 6 * m + 3 * j ;
1256- a = X .s1 ;
1257- b = X .s3 ;
1258- d = X .s5 ;
1259- e = X .s7 ;
12601256 {
1261- ulong4 coeff = (ulong4 )(a , b , d , e );
1262- tmp = vload4 (0 , wi + twiddle_offset );
1263- twiddles = (ulong4 )(1UL , tmp .s1 , tmp .s0 , tmp .s2 );
1264- uu = modMul4 (coeff , twiddles );
1265- r = butterfly (uu );
1266- X .s1 = modAdd (r .s0 , r .s2 );
1267- X .s3 = modAdd (r .s1 , r .s3 );
1268- X .s5 = modSub (r .s0 , r .s2 );
1269- X .s7 = modSub (r .s1 , r .s3 );
1257+ a = X .s1 ;
1258+ b = X .s3 ;
1259+ d = X .s5 ;
1260+ e = X .s7 ;
1261+ {
1262+ ulong4 coeff = (ulong4 )(a , b , d , e );
1263+ coeff = modMul4 (coeff , twiddles_wi15 );
1264+ r = butterfly (coeff );
1265+ X .s1 = modAdd (r .s0 , r .s2 );
1266+ X .s3 = modAdd (r .s1 , r .s3 );
1267+ X .s5 = modSub (r .s0 , r .s2 );
1268+ X .s7 = modSub (r .s1 , r .s3 );
1269+ }
12701270 }
12711271
12721272 vstore8 (X , 0 , x + base_idx );
0 commit comments