diff --git a/.github/workflows/long-tests.yml b/.github/workflows/long-tests.yml
index 7007a81ed..71a0a7109 100644
--- a/.github/workflows/long-tests.yml
+++ b/.github/workflows/long-tests.yml
@@ -6,7 +6,8 @@ name: Long Tests
 on:
   # Triggers the workflow on push or pull request events but only for the mydev branch
   push:
-  # pull_request:
+    branches-ignore:
+      - "gh-readonly-queue**"
   merge_group:
 
   # Allows you to run this workflow manually from the Actions tab
@@ -74,7 +75,14 @@ jobs:
           ./util/plotting/plot-correlation.py -c ./statistics-archive/ubench/ampere-ubench-sass.csv -H ./hw_run/AMPERE-RTX3070/11.2/ | tee ampere-ubench-correl.txt
           ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/
           rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/
-          echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/"
+          BODY="Github CI - Build $GITHUB_REF SUCCESS.
+          Action link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          Branch/PR Name: $GITHUB_REF_NAME
+          Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/."
+          SUBJECT="Github CI - Build $GITHUB_REF SUCCESS"
+          TO="${{ secrets.GROUP_EMAIL }}"
+          echo "$BODY" | mail -s "$SUBJECT" "$TO"
+          echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/."
   PTX-Simulation:
     if: github.repository == 'accel-sim/accel-sim-framework'
     runs-on: tgrogers-raid
@@ -102,7 +110,7 @@ jobs:
           source ./gpu-simulator/setup_environment.sh
 
           rm -rf ./gpu-app-collection
-          git clone git@github.com:accel-sim/gpu-app-collection.git
+          git clone -b dev  git@github.com:accel-sim/gpu-app-collection.git
           source ./gpu-app-collection/src/setup_environment
           srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src
           ./gpu-app-collection/get_regression_data.sh
diff --git a/.github/workflows/short-tests.yml b/.github/workflows/short-tests.yml
index c2fbb13fb..2a958b2f5 100644
--- a/.github/workflows/short-tests.yml
+++ b/.github/workflows/short-tests.yml
@@ -6,13 +6,15 @@ name: Short Tests
 on:
   # Triggers the workflow on push or pull request events but only for the mydev branch
   push:
+    branches-ignore:
+      - "gh-readonly-queue**"
   pull_request:
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
+jobs:  
   SASS-Simulation:
     runs-on: ubuntu-latest
     container:
@@ -27,8 +29,6 @@ jobs:
         run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh
   PTX-Simulation:
     runs-on: ubuntu-latest
-    container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
       # env:
         
     # Steps represent a sequence of tasks that will be executed as part of the job
@@ -36,11 +36,9 @@ jobs:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
       - name: Run Simulation
-        run: echo "skipped SASS-Simulation. Will perform in merge queue"
+        run: echo "skipped PTX-Simulation. Will perform in merge queue"
   Tracer-Tool:
     runs-on: ubuntu-latest
-    container:
-      image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
       # env:
         
     # Steps represent a sequence of tasks that will be executed as part of the job
@@ -48,4 +46,31 @@ jobs:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
       - name: Run Simulation
-        run: echo "skipped SASS-Simulation. Will perform in merge queue"
\ No newline at end of file
+        run: echo "skipped Tracer-Simulation. Will perform in merge queue"
+  format-code:
+    runs-on: ubuntu-latest
+    needs: [SASS-Simulation, PTX-Simulation, Tracer-Tool]
+
+    permissions:
+      # Give the default GITHUB_TOKEN write permission to commit and push the
+      # added or changed files to the repository.
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+      # Other steps that change files in the repository go here
+      # …
+      - name: Run clang-format
+        run: |
+          sudo apt-get install -y clang-format
+          ./gpu-simulator/format-code.sh
+          ./util/tracer_nvbit/tracer_tool/format-code.sh
+
+      - uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          # Optional. Commit message for the created commit.
+          # Defaults to "Apply automatic changes"
+          commit_message: Automated clang-format
+          # Optional. Option used by `git-status` to determine if the repository is 
+          # dirty. See https://git-scm.com/docs/git-status#_options
+          status_options: '--untracked-files=no'
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
index 9d3e9d7e2..da62f8689 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -40,7 +40,7 @@ pipeline {
                 source ./gpu-simulator/setup_environment.sh
 
                 rm -rf ./gpu-app-collection
-                git clone git@github.com:accel-sim/gpu-app-collection.git
+                git clone -b dev git@github.com:accel-sim/gpu-app-collection.git
                 source ./gpu-app-collection/src/setup_environment
                 srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src
                 ./gpu-app-collection/get_regression_data.sh
diff --git a/gpu-simulator/ISA_Def/ampere_opcode.h b/gpu-simulator/ISA_Def/ampere_opcode.h
index 8b0cabd5b..afc997df1 100644
--- a/gpu-simulator/ISA_Def/ampere_opcode.h
+++ b/gpu-simulator/ISA_Def/ampere_opcode.h
@@ -136,7 +136,7 @@ static const std::unordered_map<std::string, OpcodeChar> Ampere_OpcodeMap = {
     {"CCTLT", OpcodeChar(OP_CCTLT, ALU_OP)},
 
     {"LDGDEPBAR", OpcodeChar(OP_LDGDEPBAR, ALU_OP)},
-    {"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)}, 
+    {"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)},
 
     // Uniform Datapath Instruction
     // UDP unit
diff --git a/gpu-simulator/trace-driven/trace_driven.cc b/gpu-simulator/trace-driven/trace_driven.cc
index 08d121af0..ab509d21f 100644
--- a/gpu-simulator/trace-driven/trace_driven.cc
+++ b/gpu-simulator/trace-driven/trace_driven.cc
@@ -259,10 +259,10 @@ bool trace_warp_inst_t::parse_from_trace_struct(
       cache_op = CACHE_ALL;
       break;
     case OP_LDG:
-    // LDGSTS is loading the values needed directly from the global memory to shared memory.
-    // Before this feature, the values need to be loaded to registers first, then store to 
-    // the shared memory.
-    case OP_LDGSTS: // Add for memcpy_async
+    // LDGSTS is loading the values needed directly from the global memory to
+    // shared memory. Before this feature, the values need to be loaded to
+    // registers first, then store to the shared memory.
+    case OP_LDGSTS:  // Add for memcpy_async
     case OP_LDL:
       assert(data_size > 0);
       memory_op = memory_load;
@@ -272,8 +272,7 @@ bool trace_warp_inst_t::parse_from_trace_struct(
       else
         space.set_type(global_space);
       // Add for LDGSTS instruction
-      if (m_opcode == OP_LDGSTS)
-        m_is_ldgsts = true;
+      if (m_opcode == OP_LDGSTS) m_is_ldgsts = true;
       // check the cache scope, if its strong GPU, then bypass L1
       if (trace.check_opcode_contain(opcode_tokens, "STRONG") &&
           trace.check_opcode_contain(opcode_tokens, "GPU")) {
@@ -369,19 +368,18 @@ bool trace_warp_inst_t::parse_from_trace_struct(
       // barrier_type bar_type;
       // reduction_type red_type;
       break;
-    // LDGDEPBAR is to form a group containing the previous LDGSTS instructions that 
-    // have not been grouped yet. 
-    // In the implementation, a group number will be assigned once the instruction is 
-    // met.
+    // LDGDEPBAR is to form a group containing the previous LDGSTS instructions
+    // that have not been grouped yet. In the implementation, a group number
+    // will be assigned once the instruction is met.
     case OP_LDGDEPBAR:
       m_is_ldgdepbar = true;
       break;
-    // DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS 
-    // instructions. It is associated with a immediate value. The immediate value 
-    // indicates the last N LDGDEPBAR groups to not wait once the instruction is met.
-    // For example, if the immediate value is 1, then the last group is able to proceed
-    // even with DEPBAR present; if the immediate value is 0, then all of the groups 
-    // need to finish before proceed. 
+    // DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS
+    // instructions. It is associated with a immediate value. The immediate
+    // value indicates the last N LDGDEPBAR groups to not wait once the
+    // instruction is met. For example, if the immediate value is 1, then the
+    // last group is able to proceed even with DEPBAR present; if the immediate
+    // value is 0, then all of the groups need to finish before proceed.
     case OP_DEPBAR:
       m_is_depbar = true;
       m_depbar_group_no = trace.imm;
diff --git a/gpu-simulator/trace-parser/trace_parser.cc b/gpu-simulator/trace-parser/trace_parser.cc
index 68cfb235d..56dda28d4 100644
--- a/gpu-simulator/trace-parser/trace_parser.cc
+++ b/gpu-simulator/trace-parser/trace_parser.cc
@@ -12,8 +12,8 @@
 #include <vector>
 
 #include <errno.h>
-#include <unistd.h>
 #include <signal.h>
+#include <unistd.h>
 
 #include "trace_parser.h"
 
@@ -32,7 +32,10 @@ void split(const std::string &str, std::vector<std::string> &cont,
   }
 }
 
-inst_trace_t::inst_trace_t() { memadd_info = NULL; imm = 0;}
+inst_trace_t::inst_trace_t() {
+  memadd_info = NULL;
+  imm = 0;
+}
 
 inst_trace_t::~inst_trace_t() {
   if (memadd_info != NULL) delete memadd_info;
@@ -169,7 +172,7 @@ bool inst_trace_t::parse_from_string(std::string trace, unsigned trace_version,
     ss >> temp;
     sscanf(temp.c_str(), "R%d", &reg_src[i]);
   }
-   
+
   // parse mem info
   unsigned address_mode = 0;
   unsigned mem_width = 0;
@@ -287,30 +290,30 @@ kernel_trace_t *trace_parser::parse_kernel_info(
 
   std::string read_trace_cmd;
   int _l = kerneltraces_filepath.length();
-  if(_l > 3 && kerneltraces_filepath.substr(_l-3, 3) == ".xz"){
+  if (_l > 3 && kerneltraces_filepath.substr(_l - 3, 3) == ".xz") {
     // this is xz-compressed trace
     read_trace_cmd = "xz -dc " + kerneltraces_filepath;
-  } else if(_l > 7 && kerneltraces_filepath.substr(_l-7, 7) == ".traceg"){
+  } else if (_l > 7 && kerneltraces_filepath.substr(_l - 7, 7) == ".traceg") {
     // this is plain text trace
-    read_trace_cmd ="cat " + kerneltraces_filepath; 
+    read_trace_cmd = "cat " + kerneltraces_filepath;
   } else {
-    std::cerr << "Can't read trace. Only .xz and plain text are supported: " 
-              << kerneltraces_filepath <<"\n";
+    std::cerr << "Can't read trace. Only .xz and plain text are supported: "
+              << kerneltraces_filepath << "\n";
     exit(1);
   }
 
   // Create an interprocess channel, and fork out a data source process. The
   // data source process reads trace from disk, write to the channel, and the
-  // simulator process read from the channel. 
+  // simulator process read from the channel.
   int *pipefd = kernel_info->pipefd;
-  if(pipe(pipefd) != 0){
+  if (pipe(pipefd) != 0) {
     std::cerr << "Failed to create interprocess channel\n";
     perror("pipe");
     exit(1);
   }
 
   pid_t pid = fork();
-  if(pid == 0){
+  if (pid == 0) {
     // The child process is the data source. Redirect its
     // stdout to the write end of the pipe.
     close(pipefd[0]);
@@ -318,29 +321,29 @@ kernel_trace_t *trace_parser::parse_kernel_info(
 
     // When using GDB, sending Ctrl+C to the simulator will send a SIGINT signal
     // to the child process as well, subsequently causing it to terminate. To
-    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal. 
+    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal.
     // Reference:
-    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes 
+    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes
     signal(SIGINT, SIG_IGN);
 
     execle("/bin/sh", "sh", "-c", read_trace_cmd.c_str(), NULL, environ);
-    perror("execle"); // the child process shouldn't reach here if all is well.
+    perror("execle");  // the child process shouldn't reach here if all is well.
     exit(1);
   } else {
     // parent (simulator)
     close(pipefd[1]);
     dup2(pipefd[0], STDIN_FILENO);
   }
-  
-  // Parent continues from here. 
-  kernel_info->ifs = &std::cin; 
+
+  // Parent continues from here.
+  kernel_info->ifs = &std::cin;
   std::istream *ifs = kernel_info->ifs;
 
   std::cout << "Processing kernel " << kerneltraces_filepath << std::endl;
 
   std::string line;
 
-  // Important to clear the istream. Otherwise, the eofbit from the last 
+  // Important to clear the istream. Otherwise, the eofbit from the last
   // kernel may be carried over to this kernel
   ifs->clear();
   clearerr(stdin);
@@ -417,7 +420,7 @@ void trace_parser::kernel_finalizer(kernel_trace_t *trace_info) {
   // The pipe read/write end file descriptors held by the child process would
   // have been automatically closed when it terminated. But the parent
   // process may read an arbitrary amount of trace files, so it has to close
-  // all file descriptors. 
+  // all file descriptors.
   close(trace_info->pipefd[0]);
   close(trace_info->pipefd[1]);
   delete trace_info;
diff --git a/gpu-simulator/trace-parser/trace_parser.h b/gpu-simulator/trace-parser/trace_parser.h
index 4882380c6..946cf8d3d 100644
--- a/gpu-simulator/trace-parser/trace_parser.h
+++ b/gpu-simulator/trace-parser/trace_parser.h
@@ -99,12 +99,12 @@ struct kernel_trace_t {
   std::istream *ifs;
   // Anonymous pipe through which the trace is transmitted from a trace reader
   // process to the simulator process
-  int pipefd[2]={};
+  int pipefd[2] = {};
 };
 
 class trace_parser {
  public:
-  trace_parser(){}
+  trace_parser() {}
   trace_parser(const char *kernellist_filepath);
 
   std::vector<trace_command> parse_commandlist_file();
diff --git a/short-tests.sh b/short-tests.sh
index 73e07fde1..c2f357dec 100755
--- a/short-tests.sh
+++ b/short-tests.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 if [ ! -n "$CUDA_INSTALL_PATH" ]; then
 	echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH.";
 	exit;
diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml
index cc3e68d45..76d9585de 100644
--- a/util/job_launching/apps/define-all-apps.yml
+++ b/util/job_launching/apps/define-all-apps.yml
@@ -92,6 +92,13 @@ GPU_Microbenchmark:
         - shared_lat:
             - args:
               accel-sim-mem: 1G
+        - shared_bank_conflicts:
+        ## argument 1 kernel has conflicts
+            - args: 1
+              accel-sim-mem: 1G
+        ## argument 2 kernel doesn't have conflicts
+            - args: 2
+              accel-sim-mem: 1G
         - MaxFlops:
             - args:
               accel-sim-mem: 1G
diff --git a/util/job_launching/stats/example_stats.yml b/util/job_launching/stats/example_stats.yml
index 5b3a52351..768b0680d 100644
--- a/util/job_launching/stats/example_stats.yml
+++ b/util/job_launching/stats/example_stats.yml
@@ -24,6 +24,8 @@ collect_aggregate:
     - 'total dram reads\s*=\s*(.*)'
     - 'total dram writes\s*=\s*(.*)'
     - 'kernel_launch_uid\s*=\s*(.*)'
+    - 'gpgpu_n_shmem_bkconflict\s*=\s*(.*)'
+    - 'gpgpu_n_l1cache_bkconflict\s*=\s*(.*)'
 
 
 # These stats are reset each kernel and should not be diff'd
diff --git a/util/plotting/correl_mappings.py b/util/plotting/correl_mappings.py
index e474f3d8a..aa2e958e1 100644
--- a/util/plotting/correl_mappings.py
+++ b/util/plotting/correl_mappings.py
@@ -512,8 +512,18 @@
         plottype="log",
         stattype="counter",
     ),
-    CorrelStat(
-        chart_name="DRAM Reads",
+    CorrelStat(chart_name="Shared Memory Bank Conflicts",
+        plotfile="shmem-bank-conflict",
+        hw_eval="np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum\"])\
+        + np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum\"])",
+        hw_error=None,
+        sim_eval="float(sim[\"gpgpu_n_shmem_bkconflict\s*=\s*(.*)\"])",
+        hw_name="all",
+        drophwnumbelow=0,
+        plottype="log",
+        stattype="counter"
+    ),
+	CorrelStat(chart_name="DRAM Reads",
         plotfile="dram-read-transactions",
         hw_eval='np.average(hw["dram__sectors_read.sum"])',
         hw_error=None,
diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
index aaaf4b0b9..0bf834033 100644
--- a/util/tracer_nvbit/tracer_tool/inject_funcs.cu
+++ b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
@@ -17,13 +17,14 @@
  *  extern "C" __device__ __noinline__
  *    To prevent "dead"-code elimination by the compiler.
  */
-extern "C" __device__ __noinline__ void instrument_inst(
-    int pred, int opcode_id, int32_t vpc, bool is_mem, uint64_t addr,
-    int32_t width, int32_t desReg, int32_t srcReg1, int32_t srcReg2,
-    int32_t srcReg3, int32_t srcReg4, int32_t srcReg5, int32_t srcNum,
-    uint64_t immediate,
-    uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
-    uint64_t preported_dynamic_instr_counter, uint64_t pstop_report, uint32_t line_num) {
+extern "C" __device__ __noinline__ void
+instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
+                uint64_t addr, int32_t width, int32_t desReg, int32_t srcReg1,
+                int32_t srcReg2, int32_t srcReg3, int32_t srcReg4,
+                int32_t srcReg5, int32_t srcNum, uint64_t immediate,
+                uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
+                uint64_t preported_dynamic_instr_counter, uint64_t pstop_report,
+                uint32_t line_num) {
 
   const int active_mask = __ballot_sync(__activemask(), 1);
   const int predicate_mask = __ballot_sync(__activemask(), pred);
diff --git a/util/tracer_nvbit/tracer_tool/tracer_tool.cu b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
index 2b2f78d88..4123b7221 100644
--- a/util/tracer_nvbit/tracer_tool/tracer_tool.cu
+++ b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
@@ -67,7 +67,7 @@ int xz_compress_trace = 0;
 std::map<std::string, int> opcode_to_id_map;
 std::map<int, std::string> id_to_opcode_map;
 
-std::string cwd = getcwd(NULL,0);
+std::string cwd = getcwd(NULL, 0);
 std::string traces_location = cwd + "/traces/";
 std::string kernelslist_location = cwd + "/traces/kernelslist";
 std::string stats_location = cwd + "/traces/stats.csv";
@@ -90,25 +90,30 @@ void nvbit_at_init() {
               "Exclude predicated off instruction from count");
   GET_VAR_INT(lineinfo, "TRACE_LINEINFO", 0,
               "Include source code line info at the start of each traced line. "
-              "The target binary must be compiled with -lineinfo or --generate-line-info");
+              "The target binary must be compiled with -lineinfo or "
+              "--generate-line-info");
   GET_VAR_INT(dynamic_kernel_limit_end, "DYNAMIC_KERNEL_LIMIT_END", 0,
               "Limit of the number kernel to be printed, 0 means no limit");
   GET_VAR_INT(dynamic_kernel_limit_start, "DYNAMIC_KERNEL_LIMIT_START", 0,
               "start to report kernel from this kernel id, 0 means starts from "
               "the beginning, i.e. first kernel");
   GET_VAR_INT(
-         active_from_start, "ACTIVE_FROM_START", 1,
-         "Start instruction tracing from start or wait for cuProfilerStart "
-         "and cuProfilerStop. If set to 0, DYNAMIC_KERNEL_LIMIT options have no effect");
+      active_from_start, "ACTIVE_FROM_START", 1,
+      "Start instruction tracing from start or wait for cuProfilerStart "
+      "and cuProfilerStop. If set to 0, DYNAMIC_KERNEL_LIMIT options have no "
+      "effect");
   GET_VAR_INT(verbose, "TOOL_VERBOSE", 0, "Enable verbosity inside the tool");
   GET_VAR_INT(enable_compress, "TOOL_COMPRESS", 1, "Enable traces compression");
   GET_VAR_INT(print_core_id, "TOOL_TRACE_CORE", 0,
               "write the core id in the traces");
-  GET_VAR_INT(terminate_after_limit_number_of_kernels_reached, "TERMINATE_UPON_LIMIT", 0,
-              "Stop the process once the current kernel > DYNAMIC_KERNEL_LIMIT_END");
-  GET_VAR_INT(user_defined_folders, "USER_DEFINED_FOLDERS", 0, "Uses the user defined "
+  GET_VAR_INT(
+      terminate_after_limit_number_of_kernels_reached, "TERMINATE_UPON_LIMIT",
+      0, "Stop the process once the current kernel > DYNAMIC_KERNEL_LIMIT_END");
+  GET_VAR_INT(user_defined_folders, "USER_DEFINED_FOLDERS", 0,
+              "Uses the user defined "
               "folder TRACES_FOLDER path environment");
-  GET_VAR_INT(xz_compress_trace, "TRACE_FILE_COMPRESS", 0, "Create xz-compressed trace"
+  GET_VAR_INT(xz_compress_trace, "TRACE_FILE_COMPRESS", 0,
+              "Create xz-compressed trace"
               "file");
   std::string pad(100, '-');
   printf("%s\n", pad.c_str());
@@ -141,7 +146,8 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
 
     const std::vector<Instr *> &instrs = nvbit_get_instrs(ctx, f);
     if (verbose) {
-      printf("Inspecting function %s at address 0x%lx\n", nvbit_get_func_name(ctx, f), nvbit_get_func_addr(f), true);
+      printf("Inspecting function %s at address 0x%lx\n",
+             nvbit_get_func_name(ctx, f), nvbit_get_func_addr(f), true);
     }
 
     uint32_t cnt = 0;
@@ -158,9 +164,10 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
         instr->printDecoded();
       }
 
-      if(lineinfo) {
+      if (lineinfo) {
         char *file_name, *dir_name;
-        nvbit_get_line_info(ctx, func, instr->getOffset(), &file_name, &dir_name, &line_num);
+        nvbit_get_line_info(ctx, func, instr->getOffset(), &file_name,
+                            &dir_name, &line_num);
       }
 
       if (opcode_to_id_map.find(instr->getOpcode()) == opcode_to_id_map.end()) {
@@ -180,7 +187,7 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
       int num_mref = 0;
       uint64_t imm_value = 0;
 
-      for(int i = 0; i < instr->getNumOperands(); ++i){
+      for (int i = 0; i < instr->getNumOperands(); ++i) {
         const InstrType::operand_t *op = instr->getOperand(i);
         if (op->type == InstrType::OperandType::MREF) {
           assert(srcNum < MAX_SRC);
@@ -191,13 +198,11 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
           // if(mem_oper_idx == 0){
           //   mem_oper_idx = 1; // loop control
           // }
-        }
-        else if (op->type == InstrType::OperandType::REG){
-          if (i == 0){
+        } else if (op->type == InstrType::OperandType::REG) {
+          if (i == 0) {
             // find dst reg
             dst_oprd = instr->getOperand(0)->u.reg.num;
-          }
-          else {
+          } else {
             // find src regs
             assert(srcNum < MAX_SRC);
             src_oprd[srcNum] = instr->getOperand(i)->u.reg.num;
@@ -210,9 +215,9 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
         }
       }
 
-      do{
+      do {
         /* insert call to the instrumentation function with its
-        * arguments */
+         * arguments */
         nvbit_insert_call(instr, "instrument_inst", IPOINT_BEFORE);
 
         /* pass predicate value */
@@ -226,10 +231,9 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
         if (mem_oper_idx >= 0) {
           nvbit_add_call_arg_const_val32(instr, 1);
           assert(num_mref <= 2);
-          if (num_mref == 2) {  // LDGSTS
-            nvbit_add_call_arg_mref_addr64(instr, 1-mem_oper_idx);
-          }
-          else {
+          if (num_mref == 2) { // LDGSTS
+            nvbit_add_call_arg_mref_addr64(instr, 1 - mem_oper_idx);
+          } else {
             nvbit_add_call_arg_mref_addr64(instr, mem_oper_idx);
           }
           nvbit_add_call_arg_const_val32(instr, (int)instr->getSize());
@@ -251,13 +255,13 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
 
         /* immediate info */
         nvbit_add_call_arg_const_val64(instr, imm_value);
-       
+
         /* add pointer to channel_dev and other counters*/
         nvbit_add_call_arg_const_val64(instr, (uint64_t)&channel_dev);
         nvbit_add_call_arg_const_val64(instr,
-                                      (uint64_t)&total_dynamic_instr_counter);
-        nvbit_add_call_arg_const_val64(instr,
-                                      (uint64_t)&reported_dynamic_instr_counter);
+                                       (uint64_t)&total_dynamic_instr_counter);
+        nvbit_add_call_arg_const_val64(
+            instr, (uint64_t)&reported_dynamic_instr_counter);
         nvbit_add_call_arg_const_val64(instr, (uint64_t)&stop_report);
         /* Add Source code line number for current instr */
         nvbit_add_call_arg_const_val32(instr, (int)line_num);
@@ -310,15 +314,15 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
       }
     }
 
-    if (active_from_start && !dynamic_kernel_limit_start || dynamic_kernel_limit_start == 1)
+    if (active_from_start && !dynamic_kernel_limit_start ||
+        dynamic_kernel_limit_start == 1)
       active_region = true;
     else {
       if (active_from_start)
         active_region = false;
     }
 
-    if(user_defined_folders == 1)
-    {
+    if (user_defined_folders == 1) {
       std::string usr_folder = std::getenv("TRACES_FOLDER");
       std::string temp_traces_location = usr_folder;
       std::string temp_kernelslist_location = usr_folder + "/kernelslist";
@@ -326,9 +330,13 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
       traces_location.resize(temp_traces_location.size());
       kernelslist_location.resize(temp_kernelslist_location.size());
       stats_location.resize(temp_stats_location.size());
-      traces_location.replace(traces_location.begin(), traces_location.end(),temp_traces_location);
-      kernelslist_location.replace(kernelslist_location.begin(), kernelslist_location.end(),temp_kernelslist_location);
-      stats_location.replace(stats_location.begin(), stats_location.end(),temp_stats_location);
+      traces_location.replace(traces_location.begin(), traces_location.end(),
+                              temp_traces_location);
+      kernelslist_location.replace(kernelslist_location.begin(),
+                                   kernelslist_location.end(),
+                                   temp_kernelslist_location);
+      stats_location.replace(stats_location.begin(), stats_location.end(),
+                             temp_stats_location);
       printf("\n Traces location is %s \n", traces_location.c_str());
       printf("Kernelslist location is %s \n", kernelslist_location.c_str());
       printf("Stats location is %s \n", stats_location.c_str());
@@ -359,11 +367,13 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
     cuLaunchKernel_params *p = (cuLaunchKernel_params *)params;
 
     if (!is_exit) {
-      if (active_from_start && dynamic_kernel_limit_start && kernelid == dynamic_kernel_limit_start)
+      if (active_from_start && dynamic_kernel_limit_start &&
+          kernelid == dynamic_kernel_limit_start)
         active_region = true;
 
-      if (terminate_after_limit_number_of_kernels_reached && dynamic_kernel_limit_end != 0 && kernelid > dynamic_kernel_limit_end)
-      {
+      if (terminate_after_limit_number_of_kernels_reached &&
+          dynamic_kernel_limit_end != 0 &&
+          kernelid > dynamic_kernel_limit_end) {
         exit(0);
       }
 
@@ -390,10 +400,11 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
       }
 
       char buffer[1024];
-      sprintf(buffer, std::string(traces_location+"/kernel-%d.trace").c_str(), kernelid);
+      sprintf(buffer, std::string(traces_location + "/kernel-%d.trace").c_str(),
+              kernelid);
 
       if (!stop_report) {
-        if(!xz_compress_trace){
+        if (!xz_compress_trace) {
           resultsFile = fopen(buffer, "w");
           printf("Writing results to %s\n", buffer);
         } else {
@@ -421,18 +432,20 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
                 (uint64_t)nvbit_get_local_mem_base_addr(ctx));
         fprintf(resultsFile, "-nvbit version = %s\n", NVBIT_VERSION);
         fprintf(resultsFile, "-accelsim tracer version = %s\n", TRACER_VERSION);
-        fprintf(resultsFile,  "-enable lineinfo = %d\n", lineinfo);
+        fprintf(resultsFile, "-enable lineinfo = %d\n", lineinfo);
         fprintf(resultsFile, "\n");
 
-        fprintf(resultsFile,
-                "#traces format = [line_num] PC mask dest_num [reg_dests] opcode src_num "
-                "[reg_srcs] mem_width [adrrescompress?] [mem_addresses] immediate\n");
+        fprintf(resultsFile, "#traces format = [line_num] PC mask dest_num "
+                             "[reg_dests] opcode src_num "
+                             "[reg_srcs] mem_width [adrrescompress?] "
+                             "[mem_addresses] immediate\n");
         fprintf(resultsFile, "\n");
       }
 
       kernelsFile = fopen(kernelslist_location.c_str(), "a");
       // This will be a relative path to the traces file
-      sprintf(buffer,"kernel-%d.trace%s", kernelid, xz_compress_trace?".xz":"");
+      sprintf(buffer, "kernel-%d.trace%s", kernelid,
+              xz_compress_trace ? ".xz" : "");
       if (!stop_report) {
         fprintf(kernelsFile, buffer);
         fprintf(kernelsFile, "\n");
@@ -491,22 +504,26 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
       fprintf(statsFile, "\n");
       fclose(statsFile);
 
-      if (!stop_report){
-        if(!xz_compress_trace){fclose(resultsFile);} 
-        else{pclose(resultsFile);}
+      if (!stop_report) {
+        if (!xz_compress_trace) {
+          fclose(resultsFile);
+        } else {
+          pclose(resultsFile);
+        }
       }
 
-      if (active_from_start && dynamic_kernel_limit_end && kernelid > dynamic_kernel_limit_end)
+      if (active_from_start && dynamic_kernel_limit_end &&
+          kernelid > dynamic_kernel_limit_end)
         active_region = false;
     }
   } else if (cbid == API_CUDA_cuProfilerStart && is_exit) {
-      if (!active_from_start) {
-        active_region = true;
-      }
+    if (!active_from_start) {
+      active_region = true;
+    }
   } else if (cbid == API_CUDA_cuProfilerStop && is_exit) {
-      if (!active_from_start) {
-        active_region = false;
-      }
+    if (!active_from_start) {
+      active_region = false;
+    }
   }
 }
 
@@ -623,7 +640,7 @@ void *recv_thread_fun(void *) {
           fprintf(resultsFile, "%d ", ma->sm_id);
           fprintf(resultsFile, "%d ", ma->warpid_sm);
         }
-        if(lineinfo){
+        if (lineinfo) {
           fprintf(resultsFile, "%d ", ma->line_num);
         }
         fprintf(resultsFile, "%04x ", ma->vpc); // Print the virtual PC
diff --git a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp
index 44ce104d8..dd31c9ebb 100644
--- a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp
+++ b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp
@@ -1,24 +1,24 @@
+#include <deque>
 #include <fstream>
 #include <iostream>
 #include <math.h>
+#include <memory>
 #include <sstream>
 #include <stdio.h>
 #include <string>
-#include <vector>
-#include <deque>
-#include <memory>
 #include <unordered_map>
+#include <vector>
 
 #include <errno.h>
-#include <unistd.h>
 #include <signal.h>
+#include <unistd.h>
 
 using namespace std;
 
 struct threadblock_info {
   bool initialized;
   unsigned tb_id_x, tb_id_y, tb_id_z;
-  vector<deque<const string*>> warp_insts_array;
+  vector<deque<const string *>> warp_insts_array;
   threadblock_info() {
     initialized = false;
     tb_id_x = tb_id_y = tb_id_z = 0;
@@ -28,21 +28,21 @@ struct threadblock_info {
 /// @brief There exist significant repetition in the trace. The WarpInstLUT
 /// registers recurrent trace fragments in a hash map. Strings (trace fragments)
 /// are mapped to a pointer to a unique copy of that string, which is guaranteed
-/// to live throughout the scope of the lifetime of this WarpInstLUT. 
+/// to live throughout the scope of the lifetime of this WarpInstLUT.
 struct WarpInstLUT {
   // A mapping from "raw instruction string" to "a pointer to a global copy of
-  // that string". For any element (x->y) of this map, *y==x holds. 
+  // that string". For any element (x->y) of this map, *y==x holds.
   unordered_map<string, unique_ptr<string>> registration_table;
 
   /// @brief Is a string already registered?
   /// @param s The probing string.
   /// @return nullptr if the probing string does not exist in the look up table.
-  /// Otherwise, a const pointer to a unique copy of that string. 
-  const string* lookup_entry(const string s) const{
+  /// Otherwise, a const pointer to a unique copy of that string.
+  const string *lookup_entry(const string s) const {
     const auto it = registration_table.find(s);
-    
+
     // not registered
-    if(it == registration_table.end()){
+    if (it == registration_table.end()) {
       return nullptr;
     } else {
       return it->second.get();
@@ -51,22 +51,23 @@ struct WarpInstLUT {
 
   /// @brief Add a string to the look up table.
   /// @param s The string to be added.
-  /// @return A const pointer to the unique copy of the string. 
-  const string* register_new_entry(const string s){
+  /// @return A const pointer to the unique copy of the string.
+  const string *register_new_entry(const string s) {
     // Check if the string is already in the LUT.
     const string *entry_ptr = lookup_entry(s);
-    if(entry_ptr) {
-      // just in case a rare hash collision happens, we panic 
-      if(s != *entry_ptr){
-        cerr  << "FATAL: new string insertion " << s 
-              << "collides with the hash of a different string in the registration table " 
-              << *entry_ptr <<"\n";  
+    if (entry_ptr) {
+      // just in case a rare hash collision happens, we panic
+      if (s != *entry_ptr) {
+        cerr << "FATAL: new string insertion " << s
+             << "collides with the hash of a different string in the "
+                "registration table "
+             << *entry_ptr << "\n";
         abort();
       }
       return entry_ptr;
     }
 
-    // Create a new string 
+    // Create a new string
     auto new_string_ptr = std::make_unique<string>(s);
     entry_ptr = new_string_ptr.get();
     registration_table.insert({s, std::move(new_string_ptr)});
@@ -82,7 +83,7 @@ void group_per_core(const char *filepath);
 // stdin is piped to a process that reads from disk the input trace file. The
 // stdout is piped to a process that writes to disk the post-process trace. We
 // should preserve the original file descriptors for stdin/stdout before doing
-// redirections.  
+// redirections.
 int preserved_stdin_fileno;
 int preserved_stdout_fileno;
 
@@ -135,8 +136,8 @@ int main(int argc, char **argv) {
       group_per_block(filepath.c_str());
 
       int _l = line.length();
-      if(_l > 3 && line.substr(_l - 3, 3) == ".xz"){
-        ofs << line.substr(0, _l-3) << "g.xz" << endl;
+      if (_l > 3 && line.substr(_l - 3, 3) == ".xz") {
+        ofs << line.substr(0, _l - 3) << "g.xz" << endl;
       } else {
         ofs << line + "g" << endl;
       }
@@ -151,73 +152,75 @@ int main(int argc, char **argv) {
   return 0;
 }
 
-// This function redirects stdin and stdout for trace processing. 
-// For error/warning/info message to print to the terminal, always use the stderr stream. 
-// The io redirection will be restored by the time the function returns.
+// This function redirects stdin and stdout for trace processing.
+// For error/warning/info message to print to the terminal, always use the
+// stderr stream. The io redirection will be restored by the time the function
+// returns.
 void group_per_block(const char *filepath) {
   preserved_stdin_fileno = dup(STDIN_FILENO);
   preserved_stdout_fileno = dup(STDOUT_FILENO);
 
-  string filepath_str{filepath}; 
+  string filepath_str{filepath};
   WarpInstLUT warp_inst_lut;
 
-  pid_t sink_process_pid=0;
+  pid_t sink_process_pid = 0;
   string trace_sink_cmd;
   int sink_pipe_fd[2];
- 
-  pid_t source_process_pid=0;
+
+  pid_t source_process_pid = 0;
   string trace_source_cmd;
   int source_pipe_fd[2];
   string output_filepath;
 
   bool input_file_is_xz = false;
-  int _l = filepath_str.length(); 
-  if(_l > 3 && filepath_str.substr(_l - 3, 3) == ".xz"){
+  int _l = filepath_str.length();
+  if (_l > 3 && filepath_str.substr(_l - 3, 3) == ".xz") {
     // kernel-1.trace.xz --(xz -dc)--> f --(xz -1 -T0)--> kernel-1.traceg.xz
     input_file_is_xz = true;
     output_filepath = filepath_str.substr(0, _l - 3) + "g.xz";
     trace_source_cmd = "xz -dc " + filepath_str;
     trace_sink_cmd = "xz -1 -T0 > " + output_filepath;
-  } else if(_l > 6 && filepath_str.substr(_l - 6, 6) == ".trace"){
+  } else if (_l > 6 && filepath_str.substr(_l - 6, 6) == ".trace") {
     // kernel-2.trace --(cat)--> f --(cat)--> kernel-2.traceg
     input_file_is_xz = false;
     output_filepath = filepath_str + "g";
     trace_source_cmd = "cat " + filepath_str;
-    trace_sink_cmd = "cat > " + output_filepath;  
+    trace_sink_cmd = "cat > " + output_filepath;
   } else {
-    cerr  << "Only support xz or raw text format. Unable to process - and skipping - trace file "
-          << filepath_str << endl;
+    cerr << "Only support xz or raw text format. Unable to process - and "
+            "skipping - trace file "
+         << filepath_str << endl;
     close(preserved_stdin_fileno);
     close(preserved_stdout_fileno);
     return;
   }
 
-  //cerr << "source cmd is "<<trace_source_cmd<<"\n";
-  //cerr << "sink cmd is "<<trace_sink_cmd<<"\n";
+  // cerr << "source cmd is "<<trace_source_cmd<<"\n";
+  // cerr << "sink cmd is "<<trace_sink_cmd<<"\n";
 
   // fork a child process as the trace source
-  if(pipe(source_pipe_fd)!=0){
+  if (pipe(source_pipe_fd) != 0) {
     cerr << "Failed to create pipe\n";
     perror("pipe");
     exit(1);
-  } 
+  }
   source_process_pid = fork();
-  if(source_process_pid == 0){
+  if (source_process_pid == 0) {
     //  child process
     close(source_pipe_fd[0]);
     dup2(source_pipe_fd[1], STDOUT_FILENO);
-    
+
     // When using GDB, sending Ctrl+C to the program will send a SIGINT signal
     // to the child process as well, subsequently causing it to terminate. To
-    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal. 
+    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal.
     // Reference:
-    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes 
+    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes
     signal(SIGINT, SIG_IGN);
 
     execle("/bin/sh", "sh", "-c", trace_source_cmd.c_str(), NULL, environ);
     perror("execle"); // child shouldn't reach here if all is well.
     exit(1);
-  } else if(source_process_pid > 0) {
+  } else if (source_process_pid > 0) {
     // parent process - the trace post processor
     // stdin is now redirected to the read end of the source_pipe
     close(source_pipe_fd[1]);
@@ -229,21 +232,21 @@ void group_per_block(const char *filepath) {
   }
 
   // fork a child process as the trace sink
-  if(pipe(sink_pipe_fd)!=0){
+  if (pipe(sink_pipe_fd) != 0) {
     cerr << "Failed to create pipe\n";
     perror("pipe");
     exit(1);
   }
   sink_process_pid = fork();
-  if(sink_process_pid == 0){
+  if (sink_process_pid == 0) {
     // child process
     close(sink_pipe_fd[1]);
     dup2(sink_pipe_fd[0], STDIN_FILENO);
-    signal(SIGINT, SIG_IGN); // ignore SIGINT 
+    signal(SIGINT, SIG_IGN); // ignore SIGINT
     execle("/bin/sh", "sh", "-c", trace_sink_cmd.c_str(), NULL, environ);
     perror("execle"); // child shouldn't reach here if all is well.
     exit(1);
-  } else if (sink_process_pid > 0){
+  } else if (sink_process_pid > 0) {
     // parent process - the trace post processor
     // stdout is now redirected to the write end of the sink_pipe
     close(sink_pipe_fd[0]);
@@ -266,10 +269,10 @@ void group_per_block(const char *filepath) {
   bool found_grid_dim = false, found_block_dim = false;
 
   // Add a flag for LDGSTS instruction to indicate which one to remove
-  vector<vector<bool>> ldgsts_flags;  // true to remove, false to not
+  vector<vector<bool>> ldgsts_flags; // true to remove, false to not
 
   // Important... without clear(), cin.eof() may evaluate to true on the second
-  // kernel 
+  // kernel
   cin.clear();
   clearerr(stdin);
   while (!cin.eof()) {
@@ -307,7 +310,8 @@ void group_per_block(const char *filepath) {
               ceil(float(tb_dim_x * tb_dim_y * tb_dim_z) / 32));
 
           // Size the ldgsts_flags vector
-          ldgsts_flags[i].resize(ceil(float(tb_dim_x * tb_dim_y * tb_dim_z) / 32));
+          ldgsts_flags[i].resize(
+              ceil(float(tb_dim_x * tb_dim_y * tb_dim_z) / 32));
           for (unsigned j = 0; j < ldgsts_flags[i].size(); j++) {
             ldgsts_flags[i][j] = true;
           }
@@ -327,10 +331,10 @@ void group_per_block(const char *filepath) {
         insts[tb_id].tb_id_z = tb_id_z;
         insts[tb_id].initialized = true;
       }
-	//ss.ignore(); //remove the space
-	//rest_of_line.clear();
+      // ss.ignore(); //remove the space
+      // rest_of_line.clear();
       // getline(ss, rest_of_line); //get rest of the string!
-	    string rest_of_line(ss.str().substr(ss.tellg()+1));
+      string rest_of_line(ss.str().substr(ss.tellg() + 1));
 
       // Ni: ignore the shmem LDGSTS instruction
       stringstream opcode_ss;
@@ -347,21 +351,21 @@ void group_per_block(const char *filepath) {
       opcode_ss >> opcode;
 
       // Look up the warp inst table to see if this instruction has been
-      // registered. If yes, we just copy the pointer to that string. 
+      // registered. If yes, we just copy the pointer to that string.
       const string *inst_ptr = warp_inst_lut.lookup_entry(rest_of_line);
-      if(!inst_ptr) inst_ptr = warp_inst_lut.register_new_entry(rest_of_line);
+      if (!inst_ptr)
+        inst_ptr = warp_inst_lut.register_new_entry(rest_of_line);
+
+      // One actual LDGSTS instruction includes 2 LDGSTS instructions in the
+      // trace, because it has two memory references. This is trying to remove
+      // the one with the shared memory address.
 
-      // One actual LDGSTS instruction includes 2 LDGSTS instructions in the trace, 
-      // because it has two memory references. 
-      // This is trying to remove the one with the shared memory address.
-      
       if (opcode.find("LDGSTS") != string::npos) {
         if (!ldgsts_flags[tb_id][warpid_tb]) {
           insts[tb_id].warp_insts_array[warpid_tb].push_back(inst_ptr);
         }
         ldgsts_flags[tb_id][warpid_tb] = !ldgsts_flags[tb_id][warpid_tb];
-      }
-      else {
+      } else {
         insts[tb_id].warp_insts_array[warpid_tb].push_back(inst_ptr);
       }
     }
@@ -370,10 +374,12 @@ void group_per_block(const char *filepath) {
   for (unsigned i = 0; i < insts.size(); ++i) {
     // ofs<<string<<"\n";
     if (insts[i].initialized && insts[i].warp_insts_array.size() > 0) {
-      cout << "\n" << "#BEGIN_TB" << "\n";
       cout << "\n"
-          << "thread block = " << insts[i].tb_id_x << "," << insts[i].tb_id_y
-          << "," << insts[i].tb_id_z << "\n";
+           << "#BEGIN_TB"
+           << "\n";
+      cout << "\n"
+           << "thread block = " << insts[i].tb_id_x << "," << insts[i].tb_id_y
+           << "," << insts[i].tb_id_z << "\n";
     } else {
       cerr << "Warning: Thread block " << insts[i].tb_id_x << ","
            << insts[i].tb_id_y << "," << insts[i].tb_id_z << " is empty"
@@ -381,15 +387,17 @@ void group_per_block(const char *filepath) {
       continue;
     }
     for (unsigned j = 0; j < insts[i].warp_insts_array.size(); ++j) {
-      cout << "\n" << "warp = " << j << "\n";
+      cout << "\n"
+           << "warp = " << j << "\n";
       cout << "insts = " << insts[i].warp_insts_array[j].size() << "\n";
       if (insts[i].warp_insts_array[j].size() == 0) {
         cerr << "Warning: Warp " << j << " in thread block" << insts[i].tb_id_x
              << "," << insts[i].tb_id_y << "," << insts[i].tb_id_z
-             << " is empty" << "\n";
+             << " is empty"
+             << "\n";
       }
-      for (auto it = insts[i].warp_insts_array[j].cbegin(); 
-            it != insts[i].warp_insts_array[j].cend(); ++it) {
+      for (auto it = insts[i].warp_insts_array[j].cbegin();
+           it != insts[i].warp_insts_array[j].cend(); ++it) {
         // dereference once: const string*
         // dereference twice: const string
         cout << **it << "\n";