Commit 42922de

Add CUDA context support and build configuration
This commit adds CUDA context management files (cuda_context.h and cuda_context.cpp) that provide functionality similar to the existing OpenCL context. The changes include:

- Implementation of a CudaContext class inheriting from Context and Singleton
- A CUDA kernel management and execution interface
- Build-system updates to support CUDA via the enable-cuda Meson option
- Conditional linking of the CUDA runtime library on both Windows and Linux
- Addition of the enable-cuda option in meson_options.txt
- Implementation of an RMSNorm CUDA kernel and its build configuration

Signed-off-by: Daekyoung Jung <[email protected]>
1 parent: fd48dca
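
The conditional linking of the CUDA runtime that the message mentions is not visible in the five files below. As a rough sketch of what such a hunk typically looks like in this build system — the variable names cc and nntrainer_base_deps are assumptions, not confirmed by this diff:

```meson
# Hypothetical sketch -- the actual linking hunk is not shown in this view.
cc = meson.get_compiler('cpp')
if get_option('enable-cuda')
  # Locate the CUDA runtime library and add it to the project's dependency list
  cuda_rt_dep = cc.find_library('cudart', required: true)
  nntrainer_base_deps += cuda_rt_dep # assumed name for the project's dep list
endif
```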

File tree

5 files changed: +180 −0 lines changed

nntrainer/engine.cpp

Lines changed: 6 additions & 0 deletions
@@ -50,6 +50,12 @@ void Engine::add_default_object() {
 
   registerContext("gpu", &cl_context);
 #endif
+
+#ifdef ENABLE_CUDA
+  auto &cuda_context = nntrainer::CudaContext::Global();
+
+  registerContext("cuda", &cuda_context);
+#endif
 }
 
 void Engine::initialize() noexcept {
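
The cuda_context.h and cuda_context.cpp files registered here are described in the commit message but not shown in this view. As a minimal sketch of the singleton shape the call above implies — assuming CudaContext mirrors the existing OpenCL context pattern, with every name beyond CudaContext and Global() being a guess:

```cpp
// Hypothetical sketch of cuda_context.h -- the real file is not in this view.
#pragma once

namespace nntrainer {

/// Assumed to inherit from Context and Singleton, per the commit message.
class CudaContext {
public:
  /// Process-wide instance, matching the CudaContext::Global() call
  /// in engine.cpp above.
  static CudaContext &Global() {
    static CudaContext instance; // constructed once; thread-safe since C++11
    return instance;
  }

  CudaContext(const CudaContext &) = delete;
  CudaContext &operator=(const CudaContext &) = delete;

private:
  CudaContext() = default; // singleton: construct only through Global()
};

} // namespace nntrainer
```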

nntrainer/meson.build

Lines changed: 6 additions & 0 deletions
@@ -95,6 +95,11 @@ foreach elem : nntrainer_elements
   nntrainer_inc_abs += meson.current_source_dir() / elem
 endforeach
 
+# Add CUDA operations subdir if CUDA is enabled
+if get_option('enable-cuda')
+  subdir('tensor/cuda_operations')
+endif
+
 nntrainer_common_sources = [
   'nntrainer_logger.cpp',
   'app_context.cpp',
@@ -114,6 +119,7 @@ endif
 if get_option('enable-cuda')
   nntrainer_headers += meson.current_source_dir() / 'cuda_context.h'
   nntrainer_common_sources += 'cuda_context.cpp'
+  extra_defines += '-DENABLE_CUDA=1'
 endif
 
 foreach s : nntrainer_common_sources
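
The get_option('enable-cuda') calls above read a boolean option declared in meson_options.txt; the second hunk's unchanged context shows the option predates this commit. A plausible declaration, as a sketch — the default value and description are assumptions, since that file is not shown here:

```meson
# meson_options.txt (sketch; not shown in this diff)
option('enable-cuda', type: 'boolean', value: false,
       description: 'Enable the CUDA backend')
```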
nntrainer/tensor/cuda_operations/meson.build

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# Find CUDA compiler
+dep = dependency('cuda', version : '>=13', modules : ['cublas'])
+
+nvcc = find_program('nvcc', required: true)
+
+if nvcc.found()
+  cuda_sources = [
+    'rmsnorm_cuda.cu'
+  ]
+
+  cuda_headers = [
+    'rmsnorm_cuda.h'
+  ]
+
+  kernel_objects = []
+  foreach kernel : cuda_sources
+    obj_name = kernel.replace('.cu', '.o')
+    obj = custom_target(obj_name,
+      command: [nvcc, '-c', '-Xcompiler', '/MD', '@INPUT@', '-o', '@OUTPUT@'],
+      input: kernel,
+      output: obj_name
+    )
+    kernel_objects += obj
+  endforeach
+
+  nntrainer_sources += kernel_objects
+
+  foreach h : cuda_headers
+    nntrainer_headers += meson.current_source_dir() / h
+  endforeach
+
+else
+  message('CUDA compiler (nvcc) not found. CUDA kernels will not be compiled.')
+endif
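
Note that -Xcompiler /MD forwards the MSVC dynamic-CRT flag to the host compiler, so this custom_target is Windows-oriented as written; each target expands to roughly `nvcc -c -Xcompiler /MD rmsnorm_cuda.cu -o rmsnorm_cuda.o`. A platform-guarded variant might look like this (a sketch, not part of the commit):

```meson
# Sketch: forward /MD to the host compiler only when targeting Windows/MSVC.
if host_machine.system() == 'windows'
  nvcc_host_flags = ['-Xcompiler', '/MD']
else
  nvcc_host_flags = []
endif
# then: command: [nvcc, '-c'] + nvcc_host_flags + ['@INPUT@', '-o', '@OUTPUT@']
```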
nntrainer/tensor/cuda_operations/rmsnorm_cuda.cu

Lines changed: 95 additions & 0 deletions

@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2025 Samsung Electronics Co., Ltd. All Rights Reserved.
+ *
+ * @file rmsnorm_cuda.cu
+ * @date 14 Nov 2025
+ * @brief Common blas CUDA kernels
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Samsung Electronics Co., Ltd.
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#include "rmsnorm_cuda.h"
+#include <cuda_runtime.h>
+
+__global__ void rmsnorm_cuda_kernel(const float *input, float *output,
+                                    const float *alpha, float epsilon,
+                                    int H, int W) {
+  // Each block processes one row (height index)
+  int h = blockIdx.x;
+  int index = h * W;
+
+  // Shared memory for reduction
+  extern __shared__ float sdata[];
+
+  // Thread index within block
+  int tid = threadIdx.x;
+  const int blockSize = blockDim.x;
+
+  // Load input data and compute sum of squares
+  const float *in = input + index;
+  float sum_squares = 0.0f;
+
+  // Each thread processes multiple elements if W > blockSize
+  for (int i = tid; i < W; i += blockSize) {
+    float val = in[i];
+    sum_squares += val * val;
+  }
+
+  // Store partial sum in shared memory
+  sdata[tid] = sum_squares;
+  __syncthreads();
+
+  // Reduction in shared memory
+  for (int s = blockSize / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // First thread in block computes the final result
+  if (tid == 0) {
+    float mean = sdata[0] / W;
+    float scale = 1.0f / sqrtf(mean + epsilon);
+
+    // Store the scale value in shared memory for reuse
+    sdata[0] = scale;
+  }
+  __syncthreads();
+
+  // Load the computed scale
+  float scale = sdata[0];
+
+  // Compute output values
+  float *out = output + index;
+  for (int i = tid; i < W; i += blockSize) {
+    out[i] = in[i] * scale * alpha[i];
+  }
+}
+
+namespace nntrainer {
+
+void rmsnorm_cuda(const float *input, const float *gamma, float *result,
+                  const float epsilon, unsigned int height, unsigned int width) {
+  // Define block size
+  const int blockSize = 256;
+
+  // Calculate grid size (one block per row)
+  const int gridSize = height;
+
+  // Shared memory size for reduction
+  const int sharedMemSize = blockSize * sizeof(float);
+
+  // Launch the CUDA kernel
+  rmsnorm_cuda_kernel<<<gridSize, blockSize, sharedMemSize>>>(
+    input, result, gamma, epsilon, height, width);
+}
+
+void sscal_cuda(float *X, const unsigned int N, const float alpha) {
+  // TODO: Implement CUDA kernel for sscal
+}
+
+} // namespace nntrainer
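
For reference, with row width $W$, the kernel computes the standard RMSNorm of each row $x$, which is exactly the sum-of-squares reduction and scaling above:

$$\operatorname{rms}(x) = \sqrt{\frac{1}{W}\sum_{i=1}^{W} x_i^2 + \epsilon}, \qquad \text{out}_i = \frac{x_i}{\operatorname{rms}(x)}\,\gamma_i, \quad i = 1,\dots,W.$$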
nntrainer/tensor/cuda_operations/rmsnorm_cuda.h

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2025 Samsung Electronics Co., Ltd. All Rights Reserved.
+ *
+ * @file rmsnorm_cuda.h
+ * @date 14 Nov 2025
+ * @brief Common blas CUDA kernels
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Samsung Electronics Co., Ltd.
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#pragma once
+
+namespace nntrainer {
+
+/**
+ * @brief rmsnorm each row of the tensor
+ * @param[in] input float * for input
+ * @param[in] gamma float * for gamma multiplier for each row
+ * @param[in] result float * for result
+ * @param[in] epsilon epsilon to add to each row sum to prevent division by zero
+ * @param[in] height height of the tensor
+ * @param[in] width width of the tensor
+ */
+void rmsnorm_cuda(const float *input, const float *gamma, float *result,
+                  const float epsilon, unsigned int height, unsigned int width);
+
+/**
+ * @brief sscal value element by element immediately
+ * @param[in] X float * input
+ * @param[in] N unsigned int number of elements
+ * @param[in] alpha float multiplier
+ * @param[in] context RunLayerContext reference
+ */
+void sscal_cuda(float *X, const unsigned int N, const float alpha);
+
+} // namespace nntrainer
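
A minimal host-side usage sketch of this API — an assumption, not part of the commit; error handling is elided, and cudaDeviceSynchronize is added because the kernel launch inside rmsnorm_cuda is asynchronous:

```cpp
// Sketch: normalize H rows of width W on the GPU via rmsnorm_cuda.
#include <cuda_runtime.h>
#include <vector>
#include "rmsnorm_cuda.h"

int main() {
  const unsigned int H = 4, W = 1024;
  std::vector<float> in(H * W, 1.0f), gamma(W, 1.0f), out(H * W);

  // Device buffers: H*W input/output, W-wide gamma (kernel indexes alpha[i], i < W)
  float *d_in, *d_gamma, *d_out;
  cudaMalloc(&d_in, H * W * sizeof(float));
  cudaMalloc(&d_gamma, W * sizeof(float));
  cudaMalloc(&d_out, H * W * sizeof(float));
  cudaMemcpy(d_in, in.data(), H * W * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_gamma, gamma.data(), W * sizeof(float), cudaMemcpyHostToDevice);

  // One block per row: divides each row by its RMS, then scales by gamma
  nntrainer::rmsnorm_cuda(d_in, d_gamma, d_out, 1e-6f, H, W);
  cudaDeviceSynchronize(); // wait for the asynchronous launch to finish

  cudaMemcpy(out.data(), d_out, H * W * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_in);
  cudaFree(d_gamma);
  cudaFree(d_out);
  return 0;
}
```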
