@@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
// compute mask for subtraction
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
- vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
- vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
m <<= 1;
// load Q8 and take product with Q3
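The change is the same in all four sub-blocks above: under the current RVV intrinsics spec (v0.12 and later), the plain masked form `__riscv_vsub_vx_i8m1_m` takes no merge operand, so masked-off lanes of the result are left unspecified. The patch switches to the `_mu` ("mask undisturbed") policy variant and passes `q3_0` as the extra destination operand, so lanes where `(qh & m) != 0` keep their original quant value while active lanes get the 0x4 offset subtracted. A minimal scalar sketch of the per-lane behavior (the helper name and signature are hypothetical, not part of the patch):

#include <stdint.h>
#include <stddef.h>

// Scalar model of one q3_K sub-block step: where the high bit is clear,
// subtract the 0x4 offset; where it is set, leave the lane untouched,
// which is exactly what the _mu "mask undisturbed" policy guarantees.
static void q3_apply_hbit(int8_t *q3, const uint8_t *qh, uint8_t m, size_t vl) {
    for (size_t i = 0; i < vl; ++i) {
        if ((qh[i] & m) == 0) {  // active lane: vmask set by vmseq
            q3[i] -= 0x4;        // vs2[i] - rs1
        }                        // masked-off lane: vd[i] (== q3[i]) is kept
    }
}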
@@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
m <<= 1;
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
m <<= 1;
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
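The q5_K path gets the matching fix: here the mask comes from `vmsne` (high bit set), active lanes get 16 added, and passing `q5_a`/`q5_l` as the extra operand keeps masked-off lanes at their original values. For reference, my reading of the `_mu` operand order from the rvv-intrinsic-doc policy naming scheme (parameter names are illustrative):

// vd supplies the value of masked-off lanes ("mask undisturbed").
vint8m1_t __riscv_vadd_vx_i8m1_mu(vbool8_t vm,    // per-lane mask
                                  vint8m1_t vd,   // value for masked-off lanes
                                  vint8m1_t vs2,  // vector addend
                                  int8_t rs1,     // scalar addend
                                  size_t vl);     // active vector length
// Per lane i < vl: result[i] = vm[i] ? (int8_t)(vs2[i] + rs1) : vd[i].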