@@ -118,13 +118,28 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr
118
118
const size_t s12 = nb12 / ggml_element_size (src1);
119
119
// const size_t s13 = nb13 / ggml_element_size(src1);
120
120
121
- k_get_rows_float<<<block_nums, block_dims, 0 , stream>>> (
122
- src0_dd, src1_dd, dst_dd,
121
+ // CUDA grids typically cannot exceed 65535 in .y dimension
122
+ static const int64_t MAX_GRID_Y = 65535 ;
123
+ for (int64_t startY = 0 ; startY < ne10; startY += MAX_GRID_Y) {
124
+ int64_t chunkSizeY = std::min<int64_t >(MAX_GRID_Y, ne10 - startY);
125
+
126
+ // launch chunkSizeY blocks in the Y-dimension instead of the entire ne10
127
+ dim3 block_nums (block_num_x, chunkSizeY, ne11 * ne12);
128
+
129
+ // pointer offset: since the kernel calculates src1[i10*s10 + ...] and dst[i10*s1 + ...],
130
+ // we shift src1_dd/dst_dd by (startY * stride) so that when i10=0 in the kernel,
131
+ // it corresponds to row "startY" in the overall space.
132
+ const int32_t * src1_dd_offset = src1_dd + startY * s10;
133
+ float * dst_dd_offset = dst_dd + startY * s1;
134
+
135
+ k_get_rows_float<<<block_nums, block_dims, 0 , stream>>> (
136
+ src0_dd, src1_dd_offset, dst_dd_offset,
123
137
ne00, /* ne01, ne02, ne03,*/
124
138
/* ne10, ne11,*/ ne12, /* ne13,*/
125
139
/* s0,*/ s1, s2, s3,
126
140
/* nb00,*/ nb01, nb02, nb03,
127
141
s10, s11, s12/* , s13*/ );
142
+ }
128
143
129
144
GGML_UNUSED (dst);
130
145
}
0 commit comments