@@ -4079,7 +4079,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
     return s;
 }

-static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
+template <typename T> size_t push_constant_size(const T &t) {
+    static_assert(std::is_class<T>::value, "T must be a struct/class");
+    GGML_UNUSED(t);
+    return sizeof(T);
+}
+template <typename T> size_t push_constant_size(const std::vector<T> &t) {
+    GGML_UNUSED(t);
+    return sizeof(T) * t.size();
+}
+template <typename T, uint32_t N> size_t push_constant_size(const std::array<T, N> &t) {
+    GGML_UNUSED(t);
+    return sizeof(T) * N;
+}
+
+template <typename T> const T *push_constant_data(const T &t) {
+    static_assert(std::is_class<T>::value, "T must be a struct/class");
+    return &t;
+}
+template <typename T> const T *push_constant_data(const std::vector<T> &t) {
+    return t.data();
+}
+template <typename T, uint32_t N> const T *push_constant_data(const std::array<T, N> &t) {
+    return t.data();
+}
+
+template <typename T>
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
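For reference, a minimal standalone sketch of how the three size overloads resolve for the push-constant forms used at the call sites below (a plain struct, a std::vector, a std::array). The pc_t struct and all values are made up for illustration, and the array extent is declared as std::size_t here purely so the sketch builds in isolation:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>
#include <vector>

// Mirrors the overload set introduced by the patch (illustrative copy, not the backend's code).
template <typename T> std::size_t push_constant_size(const T &) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    return sizeof(T);
}
template <typename T> std::size_t push_constant_size(const std::vector<T> &t) {
    return sizeof(T) * t.size();              // size known only at runtime
}
template <typename T, std::size_t N> std::size_t push_constant_size(const std::array<T, N> &) {
    return sizeof(T) * N;                     // size known at compile time
}

struct pc_t { uint32_t ne; float scale; };    // hypothetical push-constant struct

int main() {
    pc_t pc{16, 0.5f};
    std::vector<uint32_t> vec_pc{1, 2, 3};
    std::array<uint32_t, 2> arr_pc{4, 5};
    std::printf("struct: %zu, vector: %zu, array: %zu\n",
                push_constant_size(pc), push_constant_size(vec_pc), push_constant_size(arr_pc));
    // prints "struct: 8, vector: 12, array: 8"
}
```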
@@ -4095,7 +4121,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
     vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
     ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});

-    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
+    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
     subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
                 pipeline->layout,
@@ -4558,18 +4584,18 @@ static void ggml_vk_matmul(
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
         const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
         return;
     }

     GGML_ASSERT(batch_stride_d == m * n);

     const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
     // Make sure enough workgroups get assigned for split k to work
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
     ggml_vk_sync_buffers(subctx);
     const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
 }

 static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
@@ -4617,7 +4643,7 @@ static void ggml_vk_matmul_id(
     ggml_vk_sync_buffers(subctx);
     const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
                                               nei0, nei1, nbi1, ne11, padded_n };
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
 }

 static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
@@ -4738,7 +4764,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
     };
     init_pushconst_fastdiv(pc);
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
 }

 static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
@@ -4757,7 +4783,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
     vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);

     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
 }

 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
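A side effect of the static_assert in the generic overload is that a bare scalar can no longer be passed directly, which is why the quantize call above wraps its single uint32_t in std::array<uint32_t, 1>. A small compilable sketch of that constraint (only the generic overload is reproduced; main() is illustrative):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>

// Generic overload as in the patch: rejects non-class types at compile time.
template <typename T> std::size_t push_constant_size(const T &) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    return sizeof(T);
}

int main() {
    uint32_t ne = 1024;
    // push_constant_size(ne);              // would fail: "T must be a struct/class"
    std::array<uint32_t, 1> pc{ne};         // wrapping keeps the helpers applicable
    std::printf("%zu bytes\n", push_constant_size(pc)); // prints "4 bytes"
}
```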
@@ -4957,7 +4983,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5173,7 +5199,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
         { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
-        sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
+        pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
 }

 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5261,7 +5287,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     }

     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
 }

 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5344,7 +5370,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
-        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }

 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5560,7 +5586,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
-            { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+            { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5780,7 +5806,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
         { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
           vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
-        sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
+        pc, { groups_x, (uint32_t)nei0, groups_z });
 }

 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
@@ -6130,7 +6156,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
             // there's no more than one tile of rows (i.e. workgroups_x would have been
             // one). We reuse workgroups_x to mean the number of splits, so we need to
             // cancel out the divide by wg_denoms[0].
-            sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
+            pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });

         ggml_vk_sync_buffers(subctx);
         const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
@@ -6139,7 +6165,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                 vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
                 vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
             },
-            pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
+            pc2, { (uint32_t)ne1, 1, 1 });
     } else {
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
             {
@@ -6149,7 +6175,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                 vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
                 vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
             },
-            sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
+            pc, { workgroups_x, workgroups_y, workgroups_z });
     }
 }

@@ -6827,7 +6853,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         }

         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
         // Empty src2 is possible in rope, but the shader needs a buffer
         vk_subbuffer subbuf_z;
@@ -6838,26 +6864,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         }

         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_IM2COL) {
         // im2col uses only src1 and dst buffers
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_COUNT_EQUAL) {
         ggml_vk_sync_buffers(subctx);
         // count_equal assumes that destination buffer is initialized with zeroes
         ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (use_src1) {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     }
 }

@@ -7026,7 +7052,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
             vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
             vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
             vk_subbuffer{ d_D, dst_offset, dst_size }
-        }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
+        }, pc, elements);
     } else if (version == 7) {
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
             vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
@@ -7037,7 +7063,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
             vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
             vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
             vk_subbuffer{ d_D, dst_offset, dst_size }
-        }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements);
+        }, pc, elements);
     } else {
         // shouldn't happen
        GGML_ASSERT(false);
@@ -7174,7 +7200,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
             vk_subbuffer{ d_GM, gm_offset, gm_size },
             vk_subbuffer{ d_GV, gv_offset, gv_size },
             vk_subbuffer{ d_P, p_offset, p_size },
-        }, sizeof(vk_op_push_constants), &pc, elements);
+        }, pc, elements);
 }

 static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
@@ -8063,7 +8089,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx->device, subctx);
     const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
-    ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
+    ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
     ggml_vk_ctx_end(subctx);

     auto begin = std::chrono::high_resolution_clock::now();
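Taken together, the pattern lets the dispatcher derive both arguments of pushConstants from a single typed parameter. Below is a hedged sketch of the shape of the new call path, with a mock recorder standing in for the Vulkan command buffer; dispatch_sketch, mock_push_constants, and pc_t are hypothetical names for illustration, not the backend's API, and the array extent is size_t here for a standalone build:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>
#include <vector>

// Size/data helpers in the style introduced by the patch.
template <typename T> std::size_t push_constant_size(const T &) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    return sizeof(T);
}
template <typename T> std::size_t push_constant_size(const std::vector<T> &t) { return sizeof(T) * t.size(); }
template <typename T, std::size_t N> std::size_t push_constant_size(const std::array<T, N> &) { return sizeof(T) * N; }

template <typename T> const T *push_constant_data(const T &t) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    return &t;
}
template <typename T> const T *push_constant_data(const std::vector<T> &t) { return t.data(); }
template <typename T, std::size_t N> const T *push_constant_data(const std::array<T, N> &t) { return t.data(); }

// Stand-in for vkCmdPushConstants: just reports what would be recorded.
static void mock_push_constants(std::size_t size, const void *data) {
    (void)data;
    std::printf("pushConstants: %zu bytes\n", size);
}

// Shape of the templated dispatch: one typed argument replaces the old (size, void*) pair.
template <typename T> void dispatch_sketch(const T &push_constants) {
    mock_push_constants(push_constant_size(push_constants), push_constant_data(push_constants));
}

struct pc_t { uint32_t m, n, k; };   // hypothetical push-constant struct

int main() {
    dispatch_sketch(pc_t{64, 32, 128});                   // struct: 12 bytes
    dispatch_sketch(std::vector<uint32_t>{1, 2, 3, 4});   // vector: 16 bytes
    dispatch_sketch(std::array<uint32_t, 2>{7, 9});       // array: 8 bytes
}
```

The practical gain visible in the hunks above is that the dispatch size can no longer drift from the data it describes: call sites such as the old `6 * sizeof(uint32_t)` next to a larger array, or `pc.size() * sizeof(int)` next to a `std::vector<uint32_t>`, are replaced by a single argument from which both values are derived.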