5757#include < unordered_map>
5858#include < unordered_set>
5959#include < filesystem>
60+ #include < regex>
6061
6162/* every tool needs to include this once */
6263#include " nvbit_tool.h"
@@ -138,6 +139,117 @@ std::string spinlock_run_dir = "./";
138139int spinlock_keep_intermediate_files = 0 ;
139140void spinlock_check ();
140141
/* Kernel range filter */
// Maybe move these to a util lib for all tracer tools?

// Variable handed to GET_VAR_STR for DYNAMIC_KERNEL_RANGE in nvbit_at_init().
// The filter itself is parsed directly from the environment by
// parse_kernel_ranges_from_env().
std::string kernel_ranges = "";

/**
 * One entry of the kernel filter: an inclusive kernel-ID interval together
 * with the kernel-name patterns accepted inside that interval.
 */
struct KernelRange {
  uint64_t start;  // first kernel ID covered (inclusive)
  uint64_t end;    // last kernel ID covered (inclusive); 0 means open-ended
  std::vector<std::regex>
      kernel_name_regexes;  // a kernel is accepted if ANY pattern matches
};

// Active filter entries, rebuilt by parse_kernel_ranges_from_env().
std::vector<KernelRange> g_kernel_ranges;
// Largest `end` value among the parsed entries (open-ended entries add 0).
uint64_t g_max_kernel_id = 0;

/**
 * Converts a digits-only decimal string to uint64_t without throwing.
 *
 * @param text  candidate number; must consist solely of '0'..'9'
 * @param value receives the parsed number on success, untouched on failure
 * @return false for empty input, any non-digit character (signs included)
 *         or a value that does not fit in uint64_t
 */
static bool parse_kernel_id(const std::string &text, uint64_t &value) {
  if (text.empty()) {
    return false;
  }
  uint64_t result = 0;
  for (const char c : text) {
    if (c < '0' || c > '9') {
      return false;
    }
    const uint64_t digit = static_cast<uint64_t>(c - '0');
    // result * 10 + digit would exceed UINT64_MAX
    if (result > (UINT64_MAX - digit) / 10) {
      return false;
    }
    result = result * 10 + digit;
  }
  value = result;
  return true;
}

/**
 * Parses one whitespace-free token of the form `[range][@regex[,regex...]]`.
 *
 * `range` is `N`, `N-M` or `N-`; when it is absent all IDs are covered.
 * When the regex part is absent (or empty) every kernel name is accepted.
 * Patterns that fail to compile are reported on stderr and dropped.
 *
 * @param token single entry taken from DYNAMIC_KERNEL_RANGE
 * @param out   receives the parsed entry on success
 * @return false when the numeric range part is malformed
 */
static bool parse_kernel_range_token(const std::string &token,
                                     KernelRange &out) {
  const size_t at_pos = token.find('@');
  // substr(0, npos) yields the whole token when there is no '@'
  const std::string range_part = token.substr(0, at_pos);
  const std::string regex_part =
      (at_pos == std::string::npos) ? std::string() : token.substr(at_pos + 1);

  // start = 0 / end = 0 is the "every ID" default used when no range is given
  uint64_t start = 0;
  uint64_t end = 0;
  if (!range_part.empty()) {
    const size_t dash_pos = range_part.find('-');
    if (dash_pos == std::string::npos) {
      // Single ID. Note: since 0 doubles as the open-ended marker, the
      // token "0" ends up covering every kernel.
      if (!parse_kernel_id(range_part, start)) {
        return false;
      }
      end = start;
    } else {
      const std::string end_str = range_part.substr(dash_pos + 1);
      if (!parse_kernel_id(range_part.substr(0, dash_pos), start)) {
        return false;
      }
      // "N-" keeps end == 0, i.e. open-ended
      if (!end_str.empty() && !parse_kernel_id(end_str, end)) {
        return false;
      }
    }
  }

  std::vector<std::regex> regexes;
  if (regex_part.empty()) {
    regexes.emplace_back(".*");  // match all kernel names
  } else {
    std::istringstream regex_stream(regex_part);
    std::string regex_token;
    while (std::getline(regex_stream, regex_token, ',')) {
      try {
        regexes.emplace_back(regex_token);
      } catch (const std::regex_error &) {
        std::cerr << "Invalid regex: " << regex_token << '\n';
      }
    }
  }

  out = KernelRange{start, end, std::move(regexes)};
  return true;
}

/**
 * Rebuilds g_kernel_ranges and g_max_kernel_id from the DYNAMIC_KERNEL_RANGE
 * environment variable (space-separated tokens, see
 * parse_kernel_range_token()).
 *
 * If the variable is unset, empty or contains only whitespace, a single
 * entry matching every kernel ID and name is installed. Tokens whose numeric
 * range cannot be parsed are reported on stderr and skipped instead of
 * letting an exception escape and abort the traced application.
 */
void parse_kernel_ranges_from_env() {
  g_kernel_ranges.clear();
  g_max_kernel_id = 0;

  const char *env_var = std::getenv("DYNAMIC_KERNEL_RANGE");
  std::istringstream stream(env_var ? env_var : "");
  std::string token;
  bool saw_token = false;

  while (stream >> token) {
    saw_token = true;

    KernelRange range;
    if (!parse_kernel_range_token(token, range)) {
      std::cerr << "Invalid kernel range: " << token << '\n';
      continue;
    }

    if (range.end > g_max_kernel_id) {
      g_max_kernel_id = range.end;
    }
    g_kernel_ranges.push_back(std::move(range));
  }

  if (!saw_token) {
    g_kernel_ranges.push_back({0, 0, {std::regex(".*")}});  // 0 end = trace all
  }
}

/**
 * Decides whether a kernel launch is selected by the current filter.
 *
 * @param kernel_id   ID of the launch being considered
 * @param kernel_name name tested against each entry's patterns; the whole
 *                    name must match (std::regex_match)
 * @return true if some entry in g_kernel_ranges covers `kernel_id` and at
 *         least one of its patterns matches `kernel_name`
 */
bool should_trace_kernel(uint64_t kernel_id, const std::string &kernel_name) {
  for (const auto &range : g_kernel_ranges) {
    const bool open_ended = (range.end == 0);
    if (kernel_id < range.start || (!open_ended && kernel_id > range.end)) {
      continue;  // ID outside this entry
    }
    // Match any of the regexes for this range
    for (const auto &regex : range.kernel_name_regexes) {
      if (std::regex_match(kernel_name, regex)) {
        return true;
      }
    }
  }
  return false;
}
252+
141253void * recv_thread_fun (void * args);
142254
143255void nvbit_at_init () {
@@ -152,6 +264,16 @@ void nvbit_at_init() {
152264 GET_VAR_INT (spinlock_phase, " SPINLOCK_PHASE" , 0 , " Spinlock phase" );
153265 GET_VAR_STR (spinlock_run_dir, " TRACES_FOLDER" , " Spinlock detection base directory, use the same as the traces folder" );
154266 GET_VAR_INT (spinlock_keep_intermediate_files, " SPINLOCK_KEEP_INTERMEDIATE_FILES" , 0 , " Keep intermediate files" );
267+ GET_VAR_STR (
268+ kernel_ranges, " DYNAMIC_KERNEL_RANGE" ,
269+ " Specify kernel IDs or ranges to trace. Format:\n "
270+ " - Single ID: \" 2\" traces only kernel 2.\n "
271+ " - Range: \" 5-8\" traces kernels 5 through 8 (inclusive).\n "
272+ " - Open-ended: \" 10-\" traces from kernel 10 onward.\n "
273+ " - Multiple ranges: \" 2 5-8 10-\" (space-separated).\n "
274+ " - With regex: \" 5-8@kernel_a.*,kernel_b.*\" traces kernels 5-8 "
275+ " with matching names.\n "
276+ " If unset or empty, all kernels will be traced from the beginning." );
155277 std::string pad (100 , ' -' );
156278 printf (" %s\n " , pad.c_str ());
157279
@@ -167,6 +289,9 @@ void nvbit_at_init() {
167289 if (!spinlock_run_dir.empty ()) {
168290 spinlock_run_dir += " /" ;
169291 }
292+
293+ // Parse the kernel ranges
294+ parse_kernel_ranges_from_env ();
170295}
171296
172297/* *
@@ -179,6 +304,7 @@ void nvbit_at_init() {
179304void nvbit_at_term () {
180305 // Read the spinlock_run_PHASE dir under ctx_<ctx_id> and for each unique kernel name,
181306 // we will have a vector of kernel histograms
307+ printf (" Spinlock: Start to merge histograms from %s\n " , spinlock_run_dir.c_str ());
182308 using HistogramMapByName = std::map<std::string, std::vector<KernelInstructionHistogram*>>;
183309 HistogramMapByName map;
184310
@@ -192,6 +318,7 @@ void nvbit_at_term() {
192318
193319 // Now we iterate the spinlock_run_PHASE dir under ctx_<ctx_id> folder
194320 std::string context_run_dir = folder.path ().string () + " /spinlock_run_" + std::to_string (spinlock_phase);
321+ DPRINTF (" Spinlock: Read saved histograms from %s\n " , context_run_dir.c_str ());
195322
196323 // Build this histogram vector for this context
197324 for (auto & file : std::filesystem::directory_iterator (context_run_dir)) {
@@ -201,11 +328,15 @@ void nvbit_at_term() {
201328 map[histogram->name ].push_back (histogram);
202329 }
203330 }
331+
332+ DPRINTF (" Spinlock: Read %zu kernels from %s\n " , map.size (), context_run_dir.c_str ());
333+
204334 }
205335
206336 // Now, we merge all the histograms for each kernel name
207337 std::vector<KernelInstructionHistogram*> merged_histograms;
208338 size_t id = 0 ;
339+ DPRINTF (" Spinlock: Start to merge histograms\n " );
209340 for (auto & [kernel_name, histograms] : map) {
210341 KernelInstructionHistogram* merged_histogram = new KernelInstructionHistogram ();
211342 // Set the name to the kernel name
@@ -218,6 +349,7 @@ void nvbit_at_term() {
218349 }
219350 merged_histograms.push_back (merged_histogram);
220351 }
352+ DPRINTF (" Spinlock: Merged %zu kernels\n " , merged_histograms.size ());
221353
222354 // For each merged histogram, save under spinlock_run_PHASE_merged dir
223355 std::string merged_run_dir = spinlock_run_dir + " spinlock_detection/spinlock_run_" + std::to_string (spinlock_phase) + " _merged" ;
@@ -228,6 +360,7 @@ void nvbit_at_term() {
228360 assert (false );
229361 }
230362
363+ DPRINTF (" Spinlock: Start to save merged histograms to %s\n " , merged_run_dir.c_str ());
231364 for (auto & histogram : merged_histograms) {
232365 histogram->saveToFile (merged_run_dir + " /kernel-" + std::to_string (histogram->id ) + " .histogram" );
233366 }
@@ -244,6 +377,7 @@ void nvbit_at_term() {
244377
245378 // Check for spinlock
246379 if (spinlock_phase == SPINLOCK_PHASE_CHECK) {
380+ DPRINTF (" Spinlock: Start to check for spinlock\n " );
247381 spinlock_check ();
248382 }
249383}
@@ -346,16 +480,25 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func,
346480 assert (cudaGetLastError () == cudaSuccess);
347481 }
348482
483+ // Plus 1 since tracer_tool use 1-based kernel id
484+ uint64_t kernel_id = grid_launch_id + 1 ;
485+ std::string mangled_func_name = std::string (nvbit_get_func_name (ctx, func, true ));
486+
349487 // Initialize kernel instruction histogram map
350488 if (ctx_state->instr_histogram == nullptr ) {
351- ctx_state->instr_histogram = new KernelInstructionHistogram (grid_launch_id, nvbit_get_func_name (ctx, func, true ) );
489+ ctx_state->instr_histogram = new KernelInstructionHistogram (kernel_id, mangled_func_name );
352490 } else {
353- ctx_state->instr_histogram ->reinit (grid_launch_id, nvbit_get_func_name (ctx, func, true ) );
491+ ctx_state->instr_histogram ->reinit (kernel_id, mangled_func_name );
354492 }
355493
356494 /* instrument */
357495 instrument_function_if_needed (ctx, func);
358496
497+ /* Determine if need to enable instrumentation */
498+ // Plus 1 since tracer_tool use 1-based kernel id
499+ bool enable_instrumentation = should_trace_kernel (kernel_id, mangled_func_name);
500+ bool disable_print = !enable_instrumentation;
501+
359502 int nregs = 0 ;
360503 CUDA_SAFECALL (
361504 cuFuncGetAttribute (&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, func));
@@ -379,29 +522,33 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func,
379522 if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
380523 cbid == API_CUDA_cuLaunchKernelEx) {
381524 cuLaunchKernelEx_params* p = (cuLaunchKernelEx_params*)params;
382- printf (
383- " Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
384- " Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
385- " - block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
386- " id %ld\n " ,
387- (uint64_t )ctx, pc, func_name, grid_launch_id,
388- p->config ->gridDimX , p->config ->gridDimY ,
389- p->config ->gridDimZ , p->config ->blockDimX ,
390- p->config ->blockDimY , p->config ->blockDimZ , nregs,
391- shmem_static_nbytes + p->config ->sharedMemBytes ,
392- (uint64_t )p->config ->hStream );
525+ if (!disable_print) {
526+ printf (
527+ " Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
528+ " Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
529+ " - block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
530+ " id %ld\n " ,
531+ (uint64_t )ctx, pc, func_name, grid_launch_id,
532+ p->config ->gridDimX , p->config ->gridDimY ,
533+ p->config ->gridDimZ , p->config ->blockDimX ,
534+ p->config ->blockDimY , p->config ->blockDimZ , nregs,
535+ shmem_static_nbytes + p->config ->sharedMemBytes ,
536+ (uint64_t )p->config ->hStream );
537+ }
393538 } else {
394539 cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
395- printf (
396- " Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
397- " Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
398- " - block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
399- " id %ld\n " ,
400- (uint64_t )ctx, pc, func_name, grid_launch_id, p->gridDimX ,
401- p->gridDimY , p->gridDimZ , p->blockDimX , p->blockDimY ,
402- p->blockDimZ , nregs,
403- shmem_static_nbytes + p->sharedMemBytes ,
404- (uint64_t )p->hStream );
540+ if (!disable_print) {
541+ printf (
542+ " Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
543+ " Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
544+ " - block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
545+ " id %ld\n " ,
546+ (uint64_t )ctx, pc, func_name, grid_launch_id, p->gridDimX ,
547+ p->gridDimY , p->gridDimZ , p->blockDimX , p->blockDimY ,
548+ p->blockDimZ , nregs,
549+ shmem_static_nbytes + p->sharedMemBytes ,
550+ (uint64_t )p->hStream );
551+ }
405552 }
406553
407554 // increment grid launch id for next launch
@@ -410,8 +557,7 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func,
410557 grid_launch_id++;
411558 }
412559
413- /* enable instrumented code to run */
414- nvbit_enable_instrumented (ctx, func, true );
560+ nvbit_enable_instrumented (ctx, func, enable_instrumentation);
415561
416562 // Reset the kernel receiving done flag for new kernel launch
417563 ctx_state->kernel_receiving_done = false ;
@@ -450,8 +596,13 @@ static void leave_kernel_launch(CTXstate *ctx_state, uint64_t &grid_launch_id) {
450596 }
451597
452598 // Save the histogram to file in form of kernel-<kernel_id>.histogram
453- bool success = ctx_state->instr_histogram ->saveToFile ( folder_name + " /" + " kernel-" + std::to_string (ctx_state->instr_histogram ->id ) + " .histogram" );
454- assert (success);
599+ // if we have specified to trace this kernel
600+ uint64_t kernel_id = ctx_state->instr_histogram ->id ;
601+ bool enable_save = should_trace_kernel (kernel_id, ctx_state->instr_histogram ->name );
602+ if (enable_save) {
603+ bool success = ctx_state->instr_histogram ->saveToFile ( folder_name + " /" + " kernel-" + std::to_string (kernel_id) + " .histogram" );
604+ assert (success);
605+ }
455606}
456607
457608void nvbit_at_cuda_event (CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
0 commit comments