diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 4c3703238cb6e..bed4fbaec5ddd 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -118,13 +118,28 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr
     const size_t s12 = nb12 / ggml_element_size(src1);
     //const size_t s13 = nb13 / ggml_element_size(src1);
 
-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-        src0_dd, src1_dd, dst_dd,
+    // CUDA grids cannot exceed 65535 blocks in the y dimension
+    static const int64_t MAX_GRID_Y = 65535;
+    for (int64_t startY = 0; startY < ne10; startY += MAX_GRID_Y) {
+        const int64_t chunkSizeY = std::min(MAX_GRID_Y, ne10 - startY);
+
+        // launch chunkSizeY blocks in the y dimension instead of the entire ne10
+        dim3 block_nums(block_num_x, chunkSizeY, ne11*ne12);
+
+        // pointer offset: since the kernel computes src1[i10*s10 + ...] and dst[i10*s1 + ...],
+        // shift src1_dd/dst_dd by (startY * stride) so that i10 == 0 inside the kernel
+        // corresponds to row startY in the overall index space
+        const int32_t * src1_dd_offset = src1_dd + startY*s10;
+        float         * dst_dd_offset  = dst_dd  + startY*s1;
+
+        k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd_offset, dst_dd_offset,
             ne00, /*ne01, ne02, ne03,*/
             /*ne10, ne11,*/ ne12, /*ne13,*/
             /* s0,*/ s1, s2, s3,
             /* nb00,*/ nb01, nb02, nb03,
             s10, s11, s12/*, s13*/);
+    }
 
     GGML_UNUSED(dst);
 }
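
For reference, here is the same grid.y chunking pattern in isolation: a minimal, self-contained sketch, where the `copy_rows` kernel, the `launch_chunked` wrapper, and the block size of 256 are all hypothetical illustrations rather than code from ggml; only the 65535 limit on `gridDim.y` is a hardware fact.

```cuda
#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>

// one block per row, indexed by blockIdx.y (mirrors how k_get_rows_float
// uses blockIdx.y as the row index i10)
__global__ void copy_rows(const float * src, float * dst, int64_t row_size) {
    const int64_t row = blockIdx.y;
    for (int64_t i = threadIdx.x; i < row_size; i += blockDim.x) {
        dst[row*row_size + i] = src[row*row_size + i];
    }
}

static void launch_chunked(const float * src, float * dst,
                           int64_t nrows, int64_t row_size, cudaStream_t stream) {
    static const int64_t MAX_GRID_Y = 65535; // hardware upper bound for gridDim.y

    for (int64_t startY = 0; startY < nrows; startY += MAX_GRID_Y) {
        const int64_t chunkSizeY = std::min(MAX_GRID_Y, nrows - startY);

        // shift the base pointers so that blockIdx.y == 0 maps to row startY
        const float * src_off = src + startY*row_size;
        float       * dst_off = dst + startY*row_size;

        const dim3 block_nums(1, chunkSizeY, 1);
        const dim3 block_dims(256, 1, 1);
        copy_rows<<<block_nums, block_dims, 0, stream>>>(src_off, dst_off, row_size);
    }
}
```

The pointer offsets are what make the chunking transparent to the kernel: each launch sees a zero-based `blockIdx.y`, and the host-side pointer arithmetic supplies the base row, so the kernel body needs no knowledge of `startY`.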