Skip to content

Commit f56bd1e

Browse files
speed up
1 parent 42882a7 commit f56bd1e

File tree

1 file changed

+40
-40
lines changed

1 file changed

+40
-40
lines changed

prmers.cl

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,18 +1132,25 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11321132
uint id = get_global_id(0);
11331133
uint base_idx = id * 8;
11341134
ulong8 X = vload8(0, x + base_idx);
1135-
ulong4 tmp, tmp2, twiddles, twiddles2, uu, r;
1135+
ulong4 tmp, tmp2, twiddles, uu, r;
11361136
ulong a, b, d, e, s, r0, r1;
11371137
uint k, j, i;
11381138
uint k2, j2, ii;
1139-
uint twiddle_offset;
1139+
1140+
1141+
ulong4 tmp_w12 = vload4(0, w + 12);
1142+
ulong4 twiddles_w12 = (ulong4)(1UL, tmp_w12.s1, tmp_w12.s0, tmp_w12.s2);
1143+
ulong4 tmp_w15 = vload4(0, w + 15);
1144+
ulong4 twiddles_w15 = (ulong4)(1UL, tmp_w15.s1, tmp_w15.s0, tmp_w15.s2);
1145+
1146+
ulong4 tmp_wi12 = vload4(0, wi + 12);
1147+
ulong4 twiddles_wi12 = (ulong4)(1UL, tmp_wi12.s1, tmp_wi12.s0, tmp_wi12.s2);
1148+
ulong4 tmp_wi15 = vload4(0, wi + 15);
1149+
ulong4 twiddles_wi15 = (ulong4)(1UL, tmp_wi15.s1, tmp_wi15.s0, tmp_wi15.s2);
11401150

11411151
k = id * 2;
11421152
j = k & (m - 1);
11431153
i = 4 * (k - j) + j;
1144-
twiddle_offset = 6 * m + 3 * j;
1145-
tmp = vload4(0, w + twiddle_offset);
1146-
twiddles = (ulong4)(1UL, tmp.s1, tmp.s0, tmp.s2);
11471154
{
11481155
ulong4 c = (ulong4)(X.s0, X.s2, X.s4, X.s6);
11491156
a = modAdd(c.s0, c.s2);
@@ -1154,7 +1161,7 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11541161
c.s1 = modSub(a, b);
11551162
c.s2 = modAdd(d, e);
11561163
c.s3 = modSub(d, e);
1157-
c = modMul4(c, twiddles);
1164+
c = modMul4(c, twiddles_w12);
11581165
X.s0 = c.s0;
11591166
X.s2 = c.s1;
11601167
X.s4 = c.s2;
@@ -1164,9 +1171,6 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11641171
k2 = id * 2 + 1;
11651172
j2 = k2 & (m - 1);
11661173
ii = 4 * (k2 - j2) + j2;
1167-
uint twiddle_offset2 = 6 * m + 3 * j2;
1168-
tmp2 = vload4(0, w + twiddle_offset2);
1169-
twiddles2 = (ulong4)(1UL, tmp2.s1, tmp2.s0, tmp2.s2);
11701174
{
11711175
ulong4 c2 = (ulong4)(X.s1, X.s3, X.s5, X.s7);
11721176
a = modAdd(c2.s0, c2.s2);
@@ -1177,7 +1181,7 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
11771181
c2.s1 = modSub(a, b);
11781182
c2.s2 = modAdd(d, e);
11791183
c2.s3 = modSub(d, e);
1180-
c2 = modMul4(c2, twiddles2);
1184+
c2 = modMul4(c2, twiddles_w15);
11811185
X.s1 = c2.s0;
11821186
X.s3 = c2.s1;
11831187
X.s5 = c2.s2;
@@ -1231,42 +1235,38 @@ __kernel void kernel_ntt_radix4_radix2_square_radix2_radix4(__global ulong* rest
12311235

12321236
k = id * 2;
12331237
j = k & (m - 1);
1234-
uint base = 4 * (k - j) + j;
1235-
twiddle_offset = 6 * m + 3 * j;
1236-
a = X.s0;
1237-
b = X.s2;
1238-
d = X.s4;
1239-
e = X.s6;
12401238
{
1241-
ulong4 coeff = (ulong4)(a, b, d, e);
1242-
tmp = vload4(0, wi + twiddle_offset);
1243-
twiddles = (ulong4)(1UL, tmp.s1, tmp.s0, tmp.s2);
1244-
uu = modMul4(coeff, twiddles);
1245-
r = butterfly(uu);
1246-
X.s0 = modAdd(r.s0, r.s2);
1247-
X.s2 = modAdd(r.s1, r.s3);
1248-
X.s4 = modSub(r.s0, r.s2);
1249-
X.s6 = modSub(r.s1, r.s3);
1239+
a = X.s0;
1240+
b = X.s2;
1241+
d = X.s4;
1242+
e = X.s6;
1243+
{
1244+
ulong4 coeff = (ulong4)(a, b, d, e);
1245+
coeff = modMul4(coeff, twiddles_wi12);
1246+
r = butterfly(coeff);
1247+
X.s0 = modAdd(r.s0, r.s2);
1248+
X.s2 = modAdd(r.s1, r.s3);
1249+
X.s4 = modSub(r.s0, r.s2);
1250+
X.s6 = modSub(r.s1, r.s3);
1251+
}
12501252
}
12511253

12521254
k = id * 2 + 1;
12531255
j = k & (m - 1);
1254-
base = 4 * (k - j) + j;
1255-
twiddle_offset = 6 * m + 3 * j;
1256-
a = X.s1;
1257-
b = X.s3;
1258-
d = X.s5;
1259-
e = X.s7;
12601256
{
1261-
ulong4 coeff = (ulong4)(a, b, d, e);
1262-
tmp = vload4(0, wi + twiddle_offset);
1263-
twiddles = (ulong4)(1UL, tmp.s1, tmp.s0, tmp.s2);
1264-
uu = modMul4(coeff, twiddles);
1265-
r = butterfly(uu);
1266-
X.s1 = modAdd(r.s0, r.s2);
1267-
X.s3 = modAdd(r.s1, r.s3);
1268-
X.s5 = modSub(r.s0, r.s2);
1269-
X.s7 = modSub(r.s1, r.s3);
1257+
a = X.s1;
1258+
b = X.s3;
1259+
d = X.s5;
1260+
e = X.s7;
1261+
{
1262+
ulong4 coeff = (ulong4)(a, b, d, e);
1263+
coeff = modMul4(coeff, twiddles_wi15);
1264+
r = butterfly(coeff);
1265+
X.s1 = modAdd(r.s0, r.s2);
1266+
X.s3 = modAdd(r.s1, r.s3);
1267+
X.s5 = modSub(r.s0, r.s2);
1268+
X.s7 = modSub(r.s1, r.s3);
1269+
}
12701270
}
12711271

12721272
vstore8(X, 0, x + base_idx);

0 commit comments

Comments
 (0)