Skip to content

Commit badbdfc

Browse files
committedJan 11, 2025
CUDA op getrows fails for long sequences
T5 embeddings have a square input pos tensor which quickly exceeds the 65k limit of getrows Implemented only for _float, need other implementations
1 parent 503c282 commit badbdfc

File tree

1 file changed

+17
-2
lines changed

1 file changed

+17
-2
lines changed
 

Diff for: ‎ggml/src/ggml-cuda/getrows.cu

+17-2
Original file line numberDiff line numberDiff line change
@@ -118,13 +118,28 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr
118118
const size_t s12 = nb12 / ggml_element_size(src1);
119119
//const size_t s13 = nb13 / ggml_element_size(src1);
120120

121-
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
122-
src0_dd, src1_dd, dst_dd,
121+
// CUDA grids typically cannot exceed 65535 in .y dimension
122+
static const int64_t MAX_GRID_Y = 65535;
123+
for (int64_t startY = 0; startY < ne10; startY += MAX_GRID_Y) {
124+
int64_t chunkSizeY = std::min<int64_t>(MAX_GRID_Y, ne10 - startY);
125+
126+
// launch chunkSizeY blocks in the Y-dimension instead of the entire ne10
127+
dim3 block_nums(block_num_x, chunkSizeY, ne11 * ne12);
128+
129+
// pointer offset: since the kernel calculates src1[i10*s10 + ...] and dst[i10*s1 + ...],
130+
// we shift src1_dd/dst_dd by (startY * stride) so that when i10=0 in the kernel,
131+
// it corresponds to row "startY" in the overall space.
132+
const int32_t * src1_dd_offset = src1_dd + startY * s10;
133+
float * dst_dd_offset = dst_dd + startY * s1;
134+
135+
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
136+
src0_dd, src1_dd_offset, dst_dd_offset,
123137
ne00, /*ne01, ne02, ne03,*/
124138
/*ne10, ne11,*/ ne12, /*ne13,*/
125139
/* s0,*/ s1, s2, s3,
126140
/* nb00,*/ nb01, nb02, nb03,
127141
s10, s11, s12/*, s13*/);
142+
}
128143

129144
GGML_UNUSED(dst);
130145
}

0 commit comments

Comments
 (0)
Please sign in to comment.