Merge branch 'dev' into sim-name

accel-sim · Aug 8, 2024 · 7ef7f01 · 7ef7f01
2 parents 953b8a8 + 4a90ea7
commit 7ef7f01
Show file tree

Hide file tree

Showing 14 changed files with 264 additions and 183 deletions.
diff --git a/.github/workflows/long-tests.yml b/.github/workflows/long-tests.yml
@@ -6,7 +6,8 @@ name: Long Tests
 on:
   # Triggers the workflow on push or pull request events but only for the mydev branch
   push:
-  # pull_request:
+    branches-ignore:
+      - "gh-readonly-queue**"
   merge_group:
 
   # Allows you to run this workflow manually from the Actions tab
@@ -74,7 +75,14 @@ jobs:
           ./util/plotting/plot-correlation.py -c ./statistics-archive/ubench/ampere-ubench-sass.csv -H ./hw_run/AMPERE-RTX3070/11.2/ | tee ampere-ubench-correl.txt
           ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/
           rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/
-          echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/"
+          BODY="Github CI - Build $GITHUB_REF SUCCESS.
+          Action link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          Branch/PR Name: $GITHUB_REF_NAME
+          Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/."
+          SUBJECT="Github CI - Build $GITHUB_REF SUCCESS"
+          TO="${{ secrets.GROUP_EMAIL }}"
+          echo "$BODY" | mail -s "$SUBJECT" "$TO"
+          echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/."
   PTX-Simulation:
     if: github.repository == 'accel-sim/accel-sim-framework'
     runs-on: tgrogers-raid
@@ -102,7 +110,7 @@ jobs:
           source ./gpu-simulator/setup_environment.sh
 
           rm -rf ./gpu-app-collection
-          git clone [email protected]:accel-sim/gpu-app-collection.git
+          git clone -b dev  [email protected]:accel-sim/gpu-app-collection.git
           source ./gpu-app-collection/src/setup_environment
           srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src
           ./gpu-app-collection/get_regression_data.sh

diff --git a/.github/workflows/short-tests.yml b/.github/workflows/short-tests.yml
@@ -6,13 +6,15 @@ name: Short Tests
 on:
   # Triggers the workflow on push or pull request events but only for the mydev branch
   push:
+    branches-ignore:
+      - "gh-readonly-queue**"
   pull_request:
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
+jobs:  
   SASS-Simulation:
     runs-on: ubuntu-latest
     container:
@@ -27,25 +29,48 @@ jobs:
         run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh
   PTX-Simulation:
     runs-on: ubuntu-latest
-    container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
       # env:
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
       - name: Run Simulation
-        run: echo "skipped SASS-Simulation. Will perform in merge queue"
+        run: echo "skipped PTX-Simulation. Will perform in merge queue"
   Tracer-Tool:
     runs-on: ubuntu-latest
-    container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
       # env:
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
       - name: Run Simulation
-        run: echo "skipped SASS-Simulation. Will perform in merge queue"
+        run: echo "skipped Tracer-Simulation. Will perform in merge queue"
+  format-code:
+    runs-on: ubuntu-latest
+    needs: [SASS-Simulation, PTX-Simulation, Tracer-Tool]
+
+    permissions:
+      # Give the default GITHUB_TOKEN write permission to commit and push the
+      # added or changed files to the repository.
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+      # Other steps that change files in the repository go here
+      # …
+      - name: Run clang-format
+        run: |
+          sudo apt-get install -y clang-format
+          ./gpu-simulator/format-code.sh
+          ./util/tracer_nvbit/tracer_tool/format-code.sh
+
+      - uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          # Optional. Commit message for the created commit.
+          # Defaults to "Apply automatic changes"
+          commit_message: Automated clang-format
+          # Optional. Option used by `git-status` to determine if the repository is 
+          # dirty. See https://git-scm.com/docs/git-status#_options
+          status_options: '--untracked-files=no'
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -40,7 +40,7 @@ pipeline {
                 source ./gpu-simulator/setup_environment.sh
 
                 rm -rf ./gpu-app-collection
-                git clone [email protected]:accel-sim/gpu-app-collection.git
+                git clone -b dev [email protected]:accel-sim/gpu-app-collection.git
                 source ./gpu-app-collection/src/setup_environment
                 srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src
                 ./gpu-app-collection/get_regression_data.sh

diff --git a/gpu-simulator/ISA_Def/ampere_opcode.h b/gpu-simulator/ISA_Def/ampere_opcode.h
@@ -136,7 +136,7 @@ static const std::unordered_map<std::string, OpcodeChar> Ampere_OpcodeMap = {
     {"CCTLT", OpcodeChar(OP_CCTLT, ALU_OP)},
 
     {"LDGDEPBAR", OpcodeChar(OP_LDGDEPBAR, ALU_OP)},
-    {"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)}, 
+    {"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)},
 
     // Uniform Datapath Instruction
     // UDP unit

diff --git a/gpu-simulator/trace-driven/trace_driven.cc b/gpu-simulator/trace-driven/trace_driven.cc
@@ -259,10 +259,10 @@ bool trace_warp_inst_t::parse_from_trace_struct(
       cache_op = CACHE_ALL;
       break;
     case OP_LDG:
-    // LDGSTS is loading the values needed directly from the global memory to shared memory.
-    // Before this feature, the values need to be loaded to registers first, then store to 
-    // the shared memory.
-    case OP_LDGSTS: // Add for memcpy_async
+    // LDGSTS is loading the values needed directly from the global memory to
+    // shared memory. Before this feature, the values need to be loaded to
+    // registers first, then store to the shared memory.
+    case OP_LDGSTS:  // Add for memcpy_async
     case OP_LDL:
       assert(data_size > 0);
       memory_op = memory_load;
@@ -272,8 +272,7 @@ bool trace_warp_inst_t::parse_from_trace_struct(
       else
         space.set_type(global_space);
       // Add for LDGSTS instruction
-      if (m_opcode == OP_LDGSTS)
-        m_is_ldgsts = true;
+      if (m_opcode == OP_LDGSTS) m_is_ldgsts = true;
       // check the cache scope, if its strong GPU, then bypass L1
       if (trace.check_opcode_contain(opcode_tokens, "STRONG") &&
           trace.check_opcode_contain(opcode_tokens, "GPU")) {
@@ -369,19 +368,18 @@ bool trace_warp_inst_t::parse_from_trace_struct(
       // barrier_type bar_type;
       // reduction_type red_type;
       break;
-    // LDGDEPBAR is to form a group containing the previous LDGSTS instructions that 
-    // have not been grouped yet. 
-    // In the implementation, a group number will be assigned once the instruction is 
-    // met.
+    // LDGDEPBAR is to form a group containing the previous LDGSTS instructions
+    // that have not been grouped yet. In the implementation, a group number
+    // will be assigned once the instruction is met.
     case OP_LDGDEPBAR:
       m_is_ldgdepbar = true;
       break;
-    // DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS 
-    // instructions. It is associated with a immediate value. The immediate value 
-    // indicates the last N LDGDEPBAR groups to not wait once the instruction is met.
-    // For example, if the immediate value is 1, then the last group is able to proceed
-    // even with DEPBAR present; if the immediate value is 0, then all of the groups 
-    // need to finish before proceed. 
+    // DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS
+    // instructions. It is associated with a immediate value. The immediate
+    // value indicates the last N LDGDEPBAR groups to not wait once the
+    // instruction is met. For example, if the immediate value is 1, then the
+    // last group is able to proceed even with DEPBAR present; if the immediate
+    // value is 0, then all of the groups need to finish before proceed.
     case OP_DEPBAR:
       m_is_depbar = true;
       m_depbar_group_no = trace.imm;

diff --git a/gpu-simulator/trace-parser/trace_parser.cc b/gpu-simulator/trace-parser/trace_parser.cc
@@ -12,8 +12,8 @@
 #include <vector>
 
 #include <errno.h>
-#include <unistd.h>
 #include <signal.h>
+#include <unistd.h>
 
 #include "trace_parser.h"
 
@@ -32,7 +32,10 @@ void split(const std::string &str, std::vector<std::string> &cont,
   }
 }
 
-inst_trace_t::inst_trace_t() { memadd_info = NULL; imm = 0;}
+inst_trace_t::inst_trace_t() {
+  memadd_info = NULL;
+  imm = 0;
+}
 
 inst_trace_t::~inst_trace_t() {
   if (memadd_info != NULL) delete memadd_info;
@@ -169,7 +172,7 @@ bool inst_trace_t::parse_from_string(std::string trace, unsigned trace_version,
     ss >> temp;
     sscanf(temp.c_str(), "R%d", &reg_src[i]);
   }
-   
+
   // parse mem info
   unsigned address_mode = 0;
   unsigned mem_width = 0;
@@ -287,60 +290,60 @@ kernel_trace_t *trace_parser::parse_kernel_info(
 
   std::string read_trace_cmd;
   int _l = kerneltraces_filepath.length();
-  if(_l > 3 && kerneltraces_filepath.substr(_l-3, 3) == ".xz"){
+  if (_l > 3 && kerneltraces_filepath.substr(_l - 3, 3) == ".xz") {
     // this is xz-compressed trace
     read_trace_cmd = "xz -dc " + kerneltraces_filepath;
-  } else if(_l > 7 && kerneltraces_filepath.substr(_l-7, 7) == ".traceg"){
+  } else if (_l > 7 && kerneltraces_filepath.substr(_l - 7, 7) == ".traceg") {
     // this is plain text trace
-    read_trace_cmd ="cat " + kerneltraces_filepath; 
+    read_trace_cmd = "cat " + kerneltraces_filepath;
   } else {
-    std::cerr << "Can't read trace. Only .xz and plain text are supported: " 
-              << kerneltraces_filepath <<"\n";
+    std::cerr << "Can't read trace. Only .xz and plain text are supported: "
+              << kerneltraces_filepath << "\n";
     exit(1);
   }
 
   // Create an interprocess channel, and fork out a data source process. The
   // data source process reads trace from disk, write to the channel, and the
-  // simulator process read from the channel. 
+  // simulator process read from the channel.
   int *pipefd = kernel_info->pipefd;
-  if(pipe(pipefd) != 0){
+  if (pipe(pipefd) != 0) {
     std::cerr << "Failed to create interprocess channel\n";
     perror("pipe");
     exit(1);
   }
 
   pid_t pid = fork();
-  if(pid == 0){
+  if (pid == 0) {
     // The child process is the data source. Redirect its
     // stdout to the write end of the pipe.
     close(pipefd[0]);
     dup2(pipefd[1], STDOUT_FILENO);
 
     // When using GDB, sending Ctrl+C to the simulator will send a SIGINT signal
     // to the child process as well, subsequently causing it to terminate. To
-    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal. 
+    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal.
     // Reference:
-    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes 
+    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes
     signal(SIGINT, SIG_IGN);
 
     execle("/bin/sh", "sh", "-c", read_trace_cmd.c_str(), NULL, environ);
-    perror("execle"); // the child process shouldn't reach here if all is well.
+    perror("execle");  // the child process shouldn't reach here if all is well.
     exit(1);
   } else {
     // parent (simulator)
     close(pipefd[1]);
     dup2(pipefd[0], STDIN_FILENO);
   }
-  
-  // Parent continues from here. 
-  kernel_info->ifs = &std::cin; 
+
+  // Parent continues from here.
+  kernel_info->ifs = &std::cin;
   std::istream *ifs = kernel_info->ifs;
 
   std::cout << "Processing kernel " << kerneltraces_filepath << std::endl;
 
   std::string line;
 
-  // Important to clear the istream. Otherwise, the eofbit from the last 
+  // Important to clear the istream. Otherwise, the eofbit from the last
   // kernel may be carried over to this kernel
   ifs->clear();
   clearerr(stdin);
@@ -417,7 +420,7 @@ void trace_parser::kernel_finalizer(kernel_trace_t *trace_info) {
   // The pipe read/write end file descriptors held by the child process would
   // have been automatically closed when it terminated. But the parent
   // process may read an arbitrary amount of trace files, so it has to close
-  // all file descriptors. 
+  // all file descriptors.
   close(trace_info->pipefd[0]);
   close(trace_info->pipefd[1]);
   delete trace_info;

diff --git a/gpu-simulator/trace-parser/trace_parser.h b/gpu-simulator/trace-parser/trace_parser.h
@@ -99,12 +99,12 @@ struct kernel_trace_t {
   std::istream *ifs;
   // Anonymous pipe through which the trace is transmitted from a trace reader
   // process to the simulator process
-  int pipefd[2]={};
+  int pipefd[2] = {};
 };
 
 class trace_parser {
  public:
-  trace_parser(){}
+  trace_parser() {}
   trace_parser(const char *kernellist_filepath);
 
   std::vector<trace_command> parse_commandlist_file();

diff --git a/short-tests.sh b/short-tests.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [ ! -n "$CUDA_INSTALL_PATH" ]; then
 	echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH.";
 	exit;

diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml
@@ -92,6 +92,13 @@ GPU_Microbenchmark:
         - shared_lat:
             - args:
               accel-sim-mem: 1G
+        - shared_bank_conflicts:
+        ## argument 1 kernel has conflicts
+            - args: 1
+              accel-sim-mem: 1G
+        ## argument 2 kernel doesn't have conflicts
+            - args: 2
+              accel-sim-mem: 1G
         - MaxFlops:
             - args:
               accel-sim-mem: 1G

diff --git a/util/job_launching/stats/example_stats.yml b/util/job_launching/stats/example_stats.yml
@@ -24,6 +24,8 @@ collect_aggregate:
     - 'total dram reads\s*=\s*(.*)'
     - 'total dram writes\s*=\s*(.*)'
     - 'kernel_launch_uid\s*=\s*(.*)'
+    - 'gpgpu_n_shmem_bkconflict\s*=\s*(.*)'
+    - 'gpgpu_n_l1cache_bkconflict\s*=\s*(.*)'
 
 
 # These stats are reset each kernel and should not be diff'd

diff --git a/util/plotting/correl_mappings.py b/util/plotting/correl_mappings.py
@@ -512,8 +512,18 @@
         plottype="log",
         stattype="counter",
     ),
-    CorrelStat(
-        chart_name="DRAM Reads",
+    CorrelStat(chart_name="Shared Memory Bank Conflicts",
+        plotfile="shmem-bank-conflict",
+        hw_eval="np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum\"])\
+        + np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum\"])",
+        hw_error=None,
+        sim_eval="float(sim[\"gpgpu_n_shmem_bkconflict\s*=\s*(.*)\"])",
+        hw_name="all",
+        drophwnumbelow=0,
+        plottype="log",
+        stattype="counter"
+    ),
+	CorrelStat(chart_name="DRAM Reads",
         plotfile="dram-read-transactions",
         hw_eval='np.average(hw["dram__sectors_read.sum"])',
         hw_error=None,

diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
@@ -17,13 +17,14 @@
  *  extern "C" __device__ __noinline__
  *    To prevent "dead"-code elimination by the compiler.
  */
-extern "C" __device__ __noinline__ void instrument_inst(
-    int pred, int opcode_id, int32_t vpc, bool is_mem, uint64_t addr,
-    int32_t width, int32_t desReg, int32_t srcReg1, int32_t srcReg2,
-    int32_t srcReg3, int32_t srcReg4, int32_t srcReg5, int32_t srcNum,
-    uint64_t immediate,
-    uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
-    uint64_t preported_dynamic_instr_counter, uint64_t pstop_report, uint32_t line_num) {
+extern "C" __device__ __noinline__ void
+instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
+                uint64_t addr, int32_t width, int32_t desReg, int32_t srcReg1,
+                int32_t srcReg2, int32_t srcReg3, int32_t srcReg4,
+                int32_t srcReg5, int32_t srcNum, uint64_t immediate,
+                uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
+                uint64_t preported_dynamic_instr_counter, uint64_t pstop_report,
+                uint32_t line_num) {
 
   const int active_mask = __ballot_sync(__activemask(), 1);
   const int predicate_mask = __ballot_sync(__activemask(), pred);