From 174a78c7489d87c85d638f748d3559b7fef005fb Mon Sep 17 00:00:00 2001 From: Aaron Barnes <42706182+barnes88@users.noreply.github.com> Date: Tue, 9 Jul 2024 16:12:06 -0400 Subject: [PATCH 1/3] Fix stats (#231) * fix shared memory and l1d cache bankconflict stat and add shmem benchmark * this change requires dev branch of app collection --------- Co-authored-by: JRPan <25518778+JRPan@users.noreply.github.com> --- Jenkinsfile | 2 +- util/job_launching/apps/define-all-apps.yml | 7 +++++++ util/job_launching/stats/example_stats.yml | 2 ++ util/plotting/correl_mappings.py | 14 ++++++++++++-- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9d3e9d7e2..da62f8689 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -40,7 +40,7 @@ pipeline { source ./gpu-simulator/setup_environment.sh rm -rf ./gpu-app-collection - git clone git@github.com:accel-sim/gpu-app-collection.git + git clone -b dev git@github.com:accel-sim/gpu-app-collection.git source ./gpu-app-collection/src/setup_environment srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src ./gpu-app-collection/get_regression_data.sh diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml index cc3e68d45..76d9585de 100644 --- a/util/job_launching/apps/define-all-apps.yml +++ b/util/job_launching/apps/define-all-apps.yml @@ -92,6 +92,13 @@ GPU_Microbenchmark: - shared_lat: - args: accel-sim-mem: 1G + - shared_bank_conflicts: + ## argument 1 kernel has conflicts + - args: 1 + accel-sim-mem: 1G + ## argument 2 kernel doesn't have conflicts + - args: 2 + accel-sim-mem: 1G - MaxFlops: - args: accel-sim-mem: 1G diff --git a/util/job_launching/stats/example_stats.yml b/util/job_launching/stats/example_stats.yml index 5b3a52351..768b0680d 100644 --- a/util/job_launching/stats/example_stats.yml +++ b/util/job_launching/stats/example_stats.yml @@ -24,6 +24,8 @@ collect_aggregate: - 'total dram reads\s*=\s*(.*)' - 'total dram writes\s*=\s*(.*)' - 'kernel_launch_uid\s*=\s*(.*)' + - 'gpgpu_n_shmem_bkconflict\s*=\s*(.*)' + - 'gpgpu_n_l1cache_bkconflict\s*=\s*(.*)' # These stats are reset each kernel and should not be diff'd diff --git a/util/plotting/correl_mappings.py b/util/plotting/correl_mappings.py index e474f3d8a..aa2e958e1 100644 --- a/util/plotting/correl_mappings.py +++ b/util/plotting/correl_mappings.py @@ -512,8 +512,18 @@ plottype="log", stattype="counter", ), - CorrelStat( - chart_name="DRAM Reads", + CorrelStat(chart_name="Shared Memory Bank Conflicts", + plotfile="shmem-bank-conflict", + hw_eval="np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum\"])\ + + np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum\"])", + hw_error=None, + sim_eval="float(sim[\"gpgpu_n_shmem_bkconflict\s*=\s*(.*)\"])", + hw_name="all", + drophwnumbelow=0, + plottype="log", + stattype="counter" + ), + CorrelStat(chart_name="DRAM Reads", plotfile="dram-read-transactions", hw_eval='np.average(hw["dram__sectors_read.sum"])', hw_error=None, From 4d79e27299e298903be222f3ee911f75ae2e83b6 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Tue, 9 Jul 2024 19:11:23 -0700 Subject: [PATCH 2/3] Send email after CI finish (#312) * adding email to CI * ignore on push for merge queue created branches * us dev branch gpu-app-collection * include action link --- .github/workflows/long-tests.yml | 14 +++++++++++--- .github/workflows/short-tests.yml | 10 ++++------ short-tests.sh | 2 ++ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.github/workflows/long-tests.yml b/.github/workflows/long-tests.yml index 7007a81ed..71a0a7109 100644 --- a/.github/workflows/long-tests.yml +++ b/.github/workflows/long-tests.yml @@ -6,7 +6,8 @@ name: Long Tests on: # Triggers the workflow on push or pull request events but only for the mydev branch push: - # pull_request: + branches-ignore: + - "gh-readonly-queue**" merge_group: # Allows you to run this workflow manually from the Actions tab @@ -74,7 +75,14 @@ jobs: ./util/plotting/plot-correlation.py -c ./statistics-archive/ubench/ampere-ubench-sass.csv -H ./hw_run/AMPERE-RTX3070/11.2/ | tee ampere-ubench-correl.txt ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/ rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/ - echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/" + BODY="Github CI - Build $GITHUB_REF SUCCESS. + Action link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + Branch/PR Name: $GITHUB_REF_NAME + Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/." + SUBJECT="Github CI - Build $GITHUB_REF SUCCESS" + TO="${{ secrets.GROUP_EMAIL }}" + echo "$BODY" | mail -s "$SUBJECT" "$TO" + echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/." PTX-Simulation: if: github.repository == 'accel-sim/accel-sim-framework' runs-on: tgrogers-raid @@ -102,7 +110,7 @@ jobs: source ./gpu-simulator/setup_environment.sh rm -rf ./gpu-app-collection - git clone git@github.com:accel-sim/gpu-app-collection.git + git clone -b dev git@github.com:accel-sim/gpu-app-collection.git source ./gpu-app-collection/src/setup_environment srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src ./gpu-app-collection/get_regression_data.sh diff --git a/.github/workflows/short-tests.yml b/.github/workflows/short-tests.yml index c2fbb13fb..1e76120ce 100644 --- a/.github/workflows/short-tests.yml +++ b/.github/workflows/short-tests.yml @@ -6,6 +6,8 @@ name: Short Tests on: # Triggers the workflow on push or pull request events but only for the mydev branch push: + branches-ignore: + - "gh-readonly-queue**" pull_request: # Allows you to run this workflow manually from the Actions tab @@ -27,8 +29,6 @@ jobs: run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh PTX-Simulation: runs-on: ubuntu-latest - container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 # env: # Steps represent a sequence of tasks that will be executed as part of the job @@ -36,11 +36,9 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v4 - name: Run Simulation - run: echo "skipped SASS-Simulation. Will perform in merge queue" + run: echo "skipped PTX-Simulation. Will perform in merge queue" Tracer-Tool: runs-on: ubuntu-latest - container: - image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 # env: # Steps represent a sequence of tasks that will be executed as part of the job @@ -48,4 +46,4 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v4 - name: Run Simulation - run: echo "skipped SASS-Simulation. Will perform in merge queue" \ No newline at end of file + run: echo "skipped Tracer-Simulation. Will perform in merge queue" \ No newline at end of file diff --git a/short-tests.sh b/short-tests.sh index 73e07fde1..c2f357dec 100755 --- a/short-tests.sh +++ b/short-tests.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + if [ ! -n "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; exit; From 4a90ea76a0841b755dccb82cf9824256c966a982 Mon Sep 17 00:00:00 2001 From: Aaron Barnes <42706182+barnes88@users.noreply.github.com> Date: Sat, 13 Jul 2024 18:55:45 -0600 Subject: [PATCH 3/3] Auto clang format (#322) * add clang auto-commit * Automated clang-format * use default checkout ref * Format after SASS tests --------- Co-authored-by: barnes88 Co-authored-by: JRPAN <25518778+JRPan@users.noreply.github.com> --- .github/workflows/short-tests.yml | 31 +++- gpu-simulator/ISA_Def/ampere_opcode.h | 2 +- gpu-simulator/trace-driven/trace_driven.cc | 30 ++-- gpu-simulator/trace-parser/trace_parser.cc | 41 ++--- gpu-simulator/trace-parser/trace_parser.h | 4 +- util/tracer_nvbit/tracer_tool/inject_funcs.cu | 15 +- util/tracer_nvbit/tracer_tool/tracer_tool.cu | 127 ++++++++------- .../post-traces-processing.cpp | 148 +++++++++--------- 8 files changed, 226 insertions(+), 172 deletions(-) diff --git a/.github/workflows/short-tests.yml b/.github/workflows/short-tests.yml index 1e76120ce..2a958b2f5 100644 --- a/.github/workflows/short-tests.yml +++ b/.github/workflows/short-tests.yml @@ -14,7 +14,7 @@ on: workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: +jobs: SASS-Simulation: runs-on: ubuntu-latest container: @@ -46,4 +46,31 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v4 - name: Run Simulation - run: echo "skipped Tracer-Simulation. Will perform in merge queue" \ No newline at end of file + run: echo "skipped Tracer-Simulation. Will perform in merge queue" + format-code: + runs-on: ubuntu-latest + needs: [SASS-Simulation, PTX-Simulation, Tracer-Tool] + + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. + contents: write + + steps: + - uses: actions/checkout@v4 + # Other steps that change files in the repository go here + # … + - name: Run clang-format + run: | + sudo apt-get install -y clang-format + ./gpu-simulator/format-code.sh + ./util/tracer_nvbit/tracer_tool/format-code.sh + + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + # Optional. Commit message for the created commit. + # Defaults to "Apply automatic changes" + commit_message: Automated clang-format + # Optional. Option used by `git-status` to determine if the repository is + # dirty. See https://git-scm.com/docs/git-status#_options + status_options: '--untracked-files=no' \ No newline at end of file diff --git a/gpu-simulator/ISA_Def/ampere_opcode.h b/gpu-simulator/ISA_Def/ampere_opcode.h index 8b0cabd5b..afc997df1 100644 --- a/gpu-simulator/ISA_Def/ampere_opcode.h +++ b/gpu-simulator/ISA_Def/ampere_opcode.h @@ -136,7 +136,7 @@ static const std::unordered_map Ampere_OpcodeMap = { {"CCTLT", OpcodeChar(OP_CCTLT, ALU_OP)}, {"LDGDEPBAR", OpcodeChar(OP_LDGDEPBAR, ALU_OP)}, - {"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)}, + {"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)}, // Uniform Datapath Instruction // UDP unit diff --git a/gpu-simulator/trace-driven/trace_driven.cc b/gpu-simulator/trace-driven/trace_driven.cc index 08d121af0..ab509d21f 100644 --- a/gpu-simulator/trace-driven/trace_driven.cc +++ b/gpu-simulator/trace-driven/trace_driven.cc @@ -259,10 +259,10 @@ bool trace_warp_inst_t::parse_from_trace_struct( cache_op = CACHE_ALL; break; case OP_LDG: - // LDGSTS is loading the values needed directly from the global memory to shared memory. - // Before this feature, the values need to be loaded to registers first, then store to - // the shared memory. - case OP_LDGSTS: // Add for memcpy_async + // LDGSTS is loading the values needed directly from the global memory to + // shared memory. Before this feature, the values need to be loaded to + // registers first, then store to the shared memory. + case OP_LDGSTS: // Add for memcpy_async case OP_LDL: assert(data_size > 0); memory_op = memory_load; @@ -272,8 +272,7 @@ bool trace_warp_inst_t::parse_from_trace_struct( else space.set_type(global_space); // Add for LDGSTS instruction - if (m_opcode == OP_LDGSTS) - m_is_ldgsts = true; + if (m_opcode == OP_LDGSTS) m_is_ldgsts = true; // check the cache scope, if its strong GPU, then bypass L1 if (trace.check_opcode_contain(opcode_tokens, "STRONG") && trace.check_opcode_contain(opcode_tokens, "GPU")) { @@ -369,19 +368,18 @@ bool trace_warp_inst_t::parse_from_trace_struct( // barrier_type bar_type; // reduction_type red_type; break; - // LDGDEPBAR is to form a group containing the previous LDGSTS instructions that - // have not been grouped yet. - // In the implementation, a group number will be assigned once the instruction is - // met. + // LDGDEPBAR is to form a group containing the previous LDGSTS instructions + // that have not been grouped yet. In the implementation, a group number + // will be assigned once the instruction is met. case OP_LDGDEPBAR: m_is_ldgdepbar = true; break; - // DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS - // instructions. It is associated with a immediate value. The immediate value - // indicates the last N LDGDEPBAR groups to not wait once the instruction is met. - // For example, if the immediate value is 1, then the last group is able to proceed - // even with DEPBAR present; if the immediate value is 0, then all of the groups - // need to finish before proceed. + // DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS + // instructions. It is associated with a immediate value. The immediate + // value indicates the last N LDGDEPBAR groups to not wait once the + // instruction is met. For example, if the immediate value is 1, then the + // last group is able to proceed even with DEPBAR present; if the immediate + // value is 0, then all of the groups need to finish before proceed. case OP_DEPBAR: m_is_depbar = true; m_depbar_group_no = trace.imm; diff --git a/gpu-simulator/trace-parser/trace_parser.cc b/gpu-simulator/trace-parser/trace_parser.cc index 68cfb235d..56dda28d4 100644 --- a/gpu-simulator/trace-parser/trace_parser.cc +++ b/gpu-simulator/trace-parser/trace_parser.cc @@ -12,8 +12,8 @@ #include #include -#include #include +#include #include "trace_parser.h" @@ -32,7 +32,10 @@ void split(const std::string &str, std::vector &cont, } } -inst_trace_t::inst_trace_t() { memadd_info = NULL; imm = 0;} +inst_trace_t::inst_trace_t() { + memadd_info = NULL; + imm = 0; +} inst_trace_t::~inst_trace_t() { if (memadd_info != NULL) delete memadd_info; @@ -169,7 +172,7 @@ bool inst_trace_t::parse_from_string(std::string trace, unsigned trace_version, ss >> temp; sscanf(temp.c_str(), "R%d", ®_src[i]); } - + // parse mem info unsigned address_mode = 0; unsigned mem_width = 0; @@ -287,30 +290,30 @@ kernel_trace_t *trace_parser::parse_kernel_info( std::string read_trace_cmd; int _l = kerneltraces_filepath.length(); - if(_l > 3 && kerneltraces_filepath.substr(_l-3, 3) == ".xz"){ + if (_l > 3 && kerneltraces_filepath.substr(_l - 3, 3) == ".xz") { // this is xz-compressed trace read_trace_cmd = "xz -dc " + kerneltraces_filepath; - } else if(_l > 7 && kerneltraces_filepath.substr(_l-7, 7) == ".traceg"){ + } else if (_l > 7 && kerneltraces_filepath.substr(_l - 7, 7) == ".traceg") { // this is plain text trace - read_trace_cmd ="cat " + kerneltraces_filepath; + read_trace_cmd = "cat " + kerneltraces_filepath; } else { - std::cerr << "Can't read trace. Only .xz and plain text are supported: " - << kerneltraces_filepath <<"\n"; + std::cerr << "Can't read trace. Only .xz and plain text are supported: " + << kerneltraces_filepath << "\n"; exit(1); } // Create an interprocess channel, and fork out a data source process. The // data source process reads trace from disk, write to the channel, and the - // simulator process read from the channel. + // simulator process read from the channel. int *pipefd = kernel_info->pipefd; - if(pipe(pipefd) != 0){ + if (pipe(pipefd) != 0) { std::cerr << "Failed to create interprocess channel\n"; perror("pipe"); exit(1); } pid_t pid = fork(); - if(pid == 0){ + if (pid == 0) { // The child process is the data source. Redirect its // stdout to the write end of the pipe. close(pipefd[0]); @@ -318,29 +321,29 @@ kernel_trace_t *trace_parser::parse_kernel_info( // When using GDB, sending Ctrl+C to the simulator will send a SIGINT signal // to the child process as well, subsequently causing it to terminate. To - // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal. + // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal. // Reference: - // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes + // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes signal(SIGINT, SIG_IGN); execle("/bin/sh", "sh", "-c", read_trace_cmd.c_str(), NULL, environ); - perror("execle"); // the child process shouldn't reach here if all is well. + perror("execle"); // the child process shouldn't reach here if all is well. exit(1); } else { // parent (simulator) close(pipefd[1]); dup2(pipefd[0], STDIN_FILENO); } - - // Parent continues from here. - kernel_info->ifs = &std::cin; + + // Parent continues from here. + kernel_info->ifs = &std::cin; std::istream *ifs = kernel_info->ifs; std::cout << "Processing kernel " << kerneltraces_filepath << std::endl; std::string line; - // Important to clear the istream. Otherwise, the eofbit from the last + // Important to clear the istream. Otherwise, the eofbit from the last // kernel may be carried over to this kernel ifs->clear(); clearerr(stdin); @@ -417,7 +420,7 @@ void trace_parser::kernel_finalizer(kernel_trace_t *trace_info) { // The pipe read/write end file descriptors held by the child process would // have been automatically closed when it terminated. But the parent // process may read an arbitrary amount of trace files, so it has to close - // all file descriptors. + // all file descriptors. close(trace_info->pipefd[0]); close(trace_info->pipefd[1]); delete trace_info; diff --git a/gpu-simulator/trace-parser/trace_parser.h b/gpu-simulator/trace-parser/trace_parser.h index 4882380c6..946cf8d3d 100644 --- a/gpu-simulator/trace-parser/trace_parser.h +++ b/gpu-simulator/trace-parser/trace_parser.h @@ -99,12 +99,12 @@ struct kernel_trace_t { std::istream *ifs; // Anonymous pipe through which the trace is transmitted from a trace reader // process to the simulator process - int pipefd[2]={}; + int pipefd[2] = {}; }; class trace_parser { public: - trace_parser(){} + trace_parser() {} trace_parser(const char *kernellist_filepath); std::vector parse_commandlist_file(); diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu index aaaf4b0b9..0bf834033 100644 --- a/util/tracer_nvbit/tracer_tool/inject_funcs.cu +++ b/util/tracer_nvbit/tracer_tool/inject_funcs.cu @@ -17,13 +17,14 @@ * extern "C" __device__ __noinline__ * To prevent "dead"-code elimination by the compiler. */ -extern "C" __device__ __noinline__ void instrument_inst( - int pred, int opcode_id, int32_t vpc, bool is_mem, uint64_t addr, - int32_t width, int32_t desReg, int32_t srcReg1, int32_t srcReg2, - int32_t srcReg3, int32_t srcReg4, int32_t srcReg5, int32_t srcNum, - uint64_t immediate, - uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter, - uint64_t preported_dynamic_instr_counter, uint64_t pstop_report, uint32_t line_num) { +extern "C" __device__ __noinline__ void +instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem, + uint64_t addr, int32_t width, int32_t desReg, int32_t srcReg1, + int32_t srcReg2, int32_t srcReg3, int32_t srcReg4, + int32_t srcReg5, int32_t srcNum, uint64_t immediate, + uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter, + uint64_t preported_dynamic_instr_counter, uint64_t pstop_report, + uint32_t line_num) { const int active_mask = __ballot_sync(__activemask(), 1); const int predicate_mask = __ballot_sync(__activemask(), pred); diff --git a/util/tracer_nvbit/tracer_tool/tracer_tool.cu b/util/tracer_nvbit/tracer_tool/tracer_tool.cu index 2b2f78d88..4123b7221 100644 --- a/util/tracer_nvbit/tracer_tool/tracer_tool.cu +++ b/util/tracer_nvbit/tracer_tool/tracer_tool.cu @@ -67,7 +67,7 @@ int xz_compress_trace = 0; std::map opcode_to_id_map; std::map id_to_opcode_map; -std::string cwd = getcwd(NULL,0); +std::string cwd = getcwd(NULL, 0); std::string traces_location = cwd + "/traces/"; std::string kernelslist_location = cwd + "/traces/kernelslist"; std::string stats_location = cwd + "/traces/stats.csv"; @@ -90,25 +90,30 @@ void nvbit_at_init() { "Exclude predicated off instruction from count"); GET_VAR_INT(lineinfo, "TRACE_LINEINFO", 0, "Include source code line info at the start of each traced line. " - "The target binary must be compiled with -lineinfo or --generate-line-info"); + "The target binary must be compiled with -lineinfo or " + "--generate-line-info"); GET_VAR_INT(dynamic_kernel_limit_end, "DYNAMIC_KERNEL_LIMIT_END", 0, "Limit of the number kernel to be printed, 0 means no limit"); GET_VAR_INT(dynamic_kernel_limit_start, "DYNAMIC_KERNEL_LIMIT_START", 0, "start to report kernel from this kernel id, 0 means starts from " "the beginning, i.e. first kernel"); GET_VAR_INT( - active_from_start, "ACTIVE_FROM_START", 1, - "Start instruction tracing from start or wait for cuProfilerStart " - "and cuProfilerStop. If set to 0, DYNAMIC_KERNEL_LIMIT options have no effect"); + active_from_start, "ACTIVE_FROM_START", 1, + "Start instruction tracing from start or wait for cuProfilerStart " + "and cuProfilerStop. If set to 0, DYNAMIC_KERNEL_LIMIT options have no " + "effect"); GET_VAR_INT(verbose, "TOOL_VERBOSE", 0, "Enable verbosity inside the tool"); GET_VAR_INT(enable_compress, "TOOL_COMPRESS", 1, "Enable traces compression"); GET_VAR_INT(print_core_id, "TOOL_TRACE_CORE", 0, "write the core id in the traces"); - GET_VAR_INT(terminate_after_limit_number_of_kernels_reached, "TERMINATE_UPON_LIMIT", 0, - "Stop the process once the current kernel > DYNAMIC_KERNEL_LIMIT_END"); - GET_VAR_INT(user_defined_folders, "USER_DEFINED_FOLDERS", 0, "Uses the user defined " + GET_VAR_INT( + terminate_after_limit_number_of_kernels_reached, "TERMINATE_UPON_LIMIT", + 0, "Stop the process once the current kernel > DYNAMIC_KERNEL_LIMIT_END"); + GET_VAR_INT(user_defined_folders, "USER_DEFINED_FOLDERS", 0, + "Uses the user defined " "folder TRACES_FOLDER path environment"); - GET_VAR_INT(xz_compress_trace, "TRACE_FILE_COMPRESS", 0, "Create xz-compressed trace" + GET_VAR_INT(xz_compress_trace, "TRACE_FILE_COMPRESS", 0, + "Create xz-compressed trace" "file"); std::string pad(100, '-'); printf("%s\n", pad.c_str()); @@ -141,7 +146,8 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { const std::vector &instrs = nvbit_get_instrs(ctx, f); if (verbose) { - printf("Inspecting function %s at address 0x%lx\n", nvbit_get_func_name(ctx, f), nvbit_get_func_addr(f), true); + printf("Inspecting function %s at address 0x%lx\n", + nvbit_get_func_name(ctx, f), nvbit_get_func_addr(f), true); } uint32_t cnt = 0; @@ -158,9 +164,10 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { instr->printDecoded(); } - if(lineinfo) { + if (lineinfo) { char *file_name, *dir_name; - nvbit_get_line_info(ctx, func, instr->getOffset(), &file_name, &dir_name, &line_num); + nvbit_get_line_info(ctx, func, instr->getOffset(), &file_name, + &dir_name, &line_num); } if (opcode_to_id_map.find(instr->getOpcode()) == opcode_to_id_map.end()) { @@ -180,7 +187,7 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { int num_mref = 0; uint64_t imm_value = 0; - for(int i = 0; i < instr->getNumOperands(); ++i){ + for (int i = 0; i < instr->getNumOperands(); ++i) { const InstrType::operand_t *op = instr->getOperand(i); if (op->type == InstrType::OperandType::MREF) { assert(srcNum < MAX_SRC); @@ -191,13 +198,11 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { // if(mem_oper_idx == 0){ // mem_oper_idx = 1; // loop control // } - } - else if (op->type == InstrType::OperandType::REG){ - if (i == 0){ + } else if (op->type == InstrType::OperandType::REG) { + if (i == 0) { // find dst reg dst_oprd = instr->getOperand(0)->u.reg.num; - } - else { + } else { // find src regs assert(srcNum < MAX_SRC); src_oprd[srcNum] = instr->getOperand(i)->u.reg.num; @@ -210,9 +215,9 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { } } - do{ + do { /* insert call to the instrumentation function with its - * arguments */ + * arguments */ nvbit_insert_call(instr, "instrument_inst", IPOINT_BEFORE); /* pass predicate value */ @@ -226,10 +231,9 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { if (mem_oper_idx >= 0) { nvbit_add_call_arg_const_val32(instr, 1); assert(num_mref <= 2); - if (num_mref == 2) { // LDGSTS - nvbit_add_call_arg_mref_addr64(instr, 1-mem_oper_idx); - } - else { + if (num_mref == 2) { // LDGSTS + nvbit_add_call_arg_mref_addr64(instr, 1 - mem_oper_idx); + } else { nvbit_add_call_arg_mref_addr64(instr, mem_oper_idx); } nvbit_add_call_arg_const_val32(instr, (int)instr->getSize()); @@ -251,13 +255,13 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) { /* immediate info */ nvbit_add_call_arg_const_val64(instr, imm_value); - + /* add pointer to channel_dev and other counters*/ nvbit_add_call_arg_const_val64(instr, (uint64_t)&channel_dev); nvbit_add_call_arg_const_val64(instr, - (uint64_t)&total_dynamic_instr_counter); - nvbit_add_call_arg_const_val64(instr, - (uint64_t)&reported_dynamic_instr_counter); + (uint64_t)&total_dynamic_instr_counter); + nvbit_add_call_arg_const_val64( + instr, (uint64_t)&reported_dynamic_instr_counter); nvbit_add_call_arg_const_val64(instr, (uint64_t)&stop_report); /* Add Source code line number for current instr */ nvbit_add_call_arg_const_val32(instr, (int)line_num); @@ -310,15 +314,15 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid, } } - if (active_from_start && !dynamic_kernel_limit_start || dynamic_kernel_limit_start == 1) + if (active_from_start && !dynamic_kernel_limit_start || + dynamic_kernel_limit_start == 1) active_region = true; else { if (active_from_start) active_region = false; } - if(user_defined_folders == 1) - { + if (user_defined_folders == 1) { std::string usr_folder = std::getenv("TRACES_FOLDER"); std::string temp_traces_location = usr_folder; std::string temp_kernelslist_location = usr_folder + "/kernelslist"; @@ -326,9 +330,13 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid, traces_location.resize(temp_traces_location.size()); kernelslist_location.resize(temp_kernelslist_location.size()); stats_location.resize(temp_stats_location.size()); - traces_location.replace(traces_location.begin(), traces_location.end(),temp_traces_location); - kernelslist_location.replace(kernelslist_location.begin(), kernelslist_location.end(),temp_kernelslist_location); - stats_location.replace(stats_location.begin(), stats_location.end(),temp_stats_location); + traces_location.replace(traces_location.begin(), traces_location.end(), + temp_traces_location); + kernelslist_location.replace(kernelslist_location.begin(), + kernelslist_location.end(), + temp_kernelslist_location); + stats_location.replace(stats_location.begin(), stats_location.end(), + temp_stats_location); printf("\n Traces location is %s \n", traces_location.c_str()); printf("Kernelslist location is %s \n", kernelslist_location.c_str()); printf("Stats location is %s \n", stats_location.c_str()); @@ -359,11 +367,13 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid, cuLaunchKernel_params *p = (cuLaunchKernel_params *)params; if (!is_exit) { - if (active_from_start && dynamic_kernel_limit_start && kernelid == dynamic_kernel_limit_start) + if (active_from_start && dynamic_kernel_limit_start && + kernelid == dynamic_kernel_limit_start) active_region = true; - if (terminate_after_limit_number_of_kernels_reached && dynamic_kernel_limit_end != 0 && kernelid > dynamic_kernel_limit_end) - { + if (terminate_after_limit_number_of_kernels_reached && + dynamic_kernel_limit_end != 0 && + kernelid > dynamic_kernel_limit_end) { exit(0); } @@ -390,10 +400,11 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid, } char buffer[1024]; - sprintf(buffer, std::string(traces_location+"/kernel-%d.trace").c_str(), kernelid); + sprintf(buffer, std::string(traces_location + "/kernel-%d.trace").c_str(), + kernelid); if (!stop_report) { - if(!xz_compress_trace){ + if (!xz_compress_trace) { resultsFile = fopen(buffer, "w"); printf("Writing results to %s\n", buffer); } else { @@ -421,18 +432,20 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid, (uint64_t)nvbit_get_local_mem_base_addr(ctx)); fprintf(resultsFile, "-nvbit version = %s\n", NVBIT_VERSION); fprintf(resultsFile, "-accelsim tracer version = %s\n", TRACER_VERSION); - fprintf(resultsFile, "-enable lineinfo = %d\n", lineinfo); + fprintf(resultsFile, "-enable lineinfo = %d\n", lineinfo); fprintf(resultsFile, "\n"); - fprintf(resultsFile, - "#traces format = [line_num] PC mask dest_num [reg_dests] opcode src_num " - "[reg_srcs] mem_width [adrrescompress?] [mem_addresses] immediate\n"); + fprintf(resultsFile, "#traces format = [line_num] PC mask dest_num " + "[reg_dests] opcode src_num " + "[reg_srcs] mem_width [adrrescompress?] " + "[mem_addresses] immediate\n"); fprintf(resultsFile, "\n"); } kernelsFile = fopen(kernelslist_location.c_str(), "a"); // This will be a relative path to the traces file - sprintf(buffer,"kernel-%d.trace%s", kernelid, xz_compress_trace?".xz":""); + sprintf(buffer, "kernel-%d.trace%s", kernelid, + xz_compress_trace ? ".xz" : ""); if (!stop_report) { fprintf(kernelsFile, buffer); fprintf(kernelsFile, "\n"); @@ -491,22 +504,26 @@ void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid, fprintf(statsFile, "\n"); fclose(statsFile); - if (!stop_report){ - if(!xz_compress_trace){fclose(resultsFile);} - else{pclose(resultsFile);} + if (!stop_report) { + if (!xz_compress_trace) { + fclose(resultsFile); + } else { + pclose(resultsFile); + } } - if (active_from_start && dynamic_kernel_limit_end && kernelid > dynamic_kernel_limit_end) + if (active_from_start && dynamic_kernel_limit_end && + kernelid > dynamic_kernel_limit_end) active_region = false; } } else if (cbid == API_CUDA_cuProfilerStart && is_exit) { - if (!active_from_start) { - active_region = true; - } + if (!active_from_start) { + active_region = true; + } } else if (cbid == API_CUDA_cuProfilerStop && is_exit) { - if (!active_from_start) { - active_region = false; - } + if (!active_from_start) { + active_region = false; + } } } @@ -623,7 +640,7 @@ void *recv_thread_fun(void *) { fprintf(resultsFile, "%d ", ma->sm_id); fprintf(resultsFile, "%d ", ma->warpid_sm); } - if(lineinfo){ + if (lineinfo) { fprintf(resultsFile, "%d ", ma->line_num); } fprintf(resultsFile, "%04x ", ma->vpc); // Print the virtual PC diff --git a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp index 44ce104d8..dd31c9ebb 100644 --- a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp +++ b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp @@ -1,24 +1,24 @@ +#include #include #include #include +#include #include #include #include -#include -#include -#include #include +#include #include -#include #include +#include using namespace std; struct threadblock_info { bool initialized; unsigned tb_id_x, tb_id_y, tb_id_z; - vector> warp_insts_array; + vector> warp_insts_array; threadblock_info() { initialized = false; tb_id_x = tb_id_y = tb_id_z = 0; @@ -28,21 +28,21 @@ struct threadblock_info { /// @brief There exist significant repetition in the trace. The WarpInstLUT /// registers recurrent trace fragments in a hash map. Strings (trace fragments) /// are mapped to a pointer to a unique copy of that string, which is guaranteed -/// to live throughout the scope of the lifetime of this WarpInstLUT. +/// to live throughout the scope of the lifetime of this WarpInstLUT. struct WarpInstLUT { // A mapping from "raw instruction string" to "a pointer to a global copy of - // that string". For any element (x->y) of this map, *y==x holds. + // that string". For any element (x->y) of this map, *y==x holds. unordered_map> registration_table; /// @brief Is a string already registered? /// @param s The probing string. /// @return nullptr if the probing string does not exist in the look up table. - /// Otherwise, a const pointer to a unique copy of that string. - const string* lookup_entry(const string s) const{ + /// Otherwise, a const pointer to a unique copy of that string. + const string *lookup_entry(const string s) const { const auto it = registration_table.find(s); - + // not registered - if(it == registration_table.end()){ + if (it == registration_table.end()) { return nullptr; } else { return it->second.get(); @@ -51,22 +51,23 @@ struct WarpInstLUT { /// @brief Add a string to the look up table. /// @param s The string to be added. - /// @return A const pointer to the unique copy of the string. - const string* register_new_entry(const string s){ + /// @return A const pointer to the unique copy of the string. + const string *register_new_entry(const string s) { // Check if the string is already in the LUT. const string *entry_ptr = lookup_entry(s); - if(entry_ptr) { - // just in case a rare hash collision happens, we panic - if(s != *entry_ptr){ - cerr << "FATAL: new string insertion " << s - << "collides with the hash of a different string in the registration table " - << *entry_ptr <<"\n"; + if (entry_ptr) { + // just in case a rare hash collision happens, we panic + if (s != *entry_ptr) { + cerr << "FATAL: new string insertion " << s + << "collides with the hash of a different string in the " + "registration table " + << *entry_ptr << "\n"; abort(); } return entry_ptr; } - // Create a new string + // Create a new string auto new_string_ptr = std::make_unique(s); entry_ptr = new_string_ptr.get(); registration_table.insert({s, std::move(new_string_ptr)}); @@ -82,7 +83,7 @@ void group_per_core(const char *filepath); // stdin is piped to a process that reads from disk the input trace file. The // stdout is piped to a process that writes to disk the post-process trace. We // should preserve the original file descriptors for stdin/stdout before doing -// redirections. +// redirections. int preserved_stdin_fileno; int preserved_stdout_fileno; @@ -135,8 +136,8 @@ int main(int argc, char **argv) { group_per_block(filepath.c_str()); int _l = line.length(); - if(_l > 3 && line.substr(_l - 3, 3) == ".xz"){ - ofs << line.substr(0, _l-3) << "g.xz" << endl; + if (_l > 3 && line.substr(_l - 3, 3) == ".xz") { + ofs << line.substr(0, _l - 3) << "g.xz" << endl; } else { ofs << line + "g" << endl; } @@ -151,73 +152,75 @@ int main(int argc, char **argv) { return 0; } -// This function redirects stdin and stdout for trace processing. -// For error/warning/info message to print to the terminal, always use the stderr stream. -// The io redirection will be restored by the time the function returns. +// This function redirects stdin and stdout for trace processing. +// For error/warning/info message to print to the terminal, always use the +// stderr stream. The io redirection will be restored by the time the function +// returns. void group_per_block(const char *filepath) { preserved_stdin_fileno = dup(STDIN_FILENO); preserved_stdout_fileno = dup(STDOUT_FILENO); - string filepath_str{filepath}; + string filepath_str{filepath}; WarpInstLUT warp_inst_lut; - pid_t sink_process_pid=0; + pid_t sink_process_pid = 0; string trace_sink_cmd; int sink_pipe_fd[2]; - - pid_t source_process_pid=0; + + pid_t source_process_pid = 0; string trace_source_cmd; int source_pipe_fd[2]; string output_filepath; bool input_file_is_xz = false; - int _l = filepath_str.length(); - if(_l > 3 && filepath_str.substr(_l - 3, 3) == ".xz"){ + int _l = filepath_str.length(); + if (_l > 3 && filepath_str.substr(_l - 3, 3) == ".xz") { // kernel-1.trace.xz --(xz -dc)--> f --(xz -1 -T0)--> kernel-1.traceg.xz input_file_is_xz = true; output_filepath = filepath_str.substr(0, _l - 3) + "g.xz"; trace_source_cmd = "xz -dc " + filepath_str; trace_sink_cmd = "xz -1 -T0 > " + output_filepath; - } else if(_l > 6 && filepath_str.substr(_l - 6, 6) == ".trace"){ + } else if (_l > 6 && filepath_str.substr(_l - 6, 6) == ".trace") { // kernel-2.trace --(cat)--> f --(cat)--> kernel-2.traceg input_file_is_xz = false; output_filepath = filepath_str + "g"; trace_source_cmd = "cat " + filepath_str; - trace_sink_cmd = "cat > " + output_filepath; + trace_sink_cmd = "cat > " + output_filepath; } else { - cerr << "Only support xz or raw text format. Unable to process - and skipping - trace file " - << filepath_str << endl; + cerr << "Only support xz or raw text format. Unable to process - and " + "skipping - trace file " + << filepath_str << endl; close(preserved_stdin_fileno); close(preserved_stdout_fileno); return; } - //cerr << "source cmd is "< 0) { + } else if (source_process_pid > 0) { // parent process - the trace post processor // stdin is now redirected to the read end of the source_pipe close(source_pipe_fd[1]); @@ -229,21 +232,21 @@ void group_per_block(const char *filepath) { } // fork a child process as the trace sink - if(pipe(sink_pipe_fd)!=0){ + if (pipe(sink_pipe_fd) != 0) { cerr << "Failed to create pipe\n"; perror("pipe"); exit(1); } sink_process_pid = fork(); - if(sink_process_pid == 0){ + if (sink_process_pid == 0) { // child process close(sink_pipe_fd[1]); dup2(sink_pipe_fd[0], STDIN_FILENO); - signal(SIGINT, SIG_IGN); // ignore SIGINT + signal(SIGINT, SIG_IGN); // ignore SIGINT execle("/bin/sh", "sh", "-c", trace_sink_cmd.c_str(), NULL, environ); perror("execle"); // child shouldn't reach here if all is well. exit(1); - } else if (sink_process_pid > 0){ + } else if (sink_process_pid > 0) { // parent process - the trace post processor // stdout is now redirected to the write end of the sink_pipe close(sink_pipe_fd[0]); @@ -266,10 +269,10 @@ void group_per_block(const char *filepath) { bool found_grid_dim = false, found_block_dim = false; // Add a flag for LDGSTS instruction to indicate which one to remove - vector> ldgsts_flags; // true to remove, false to not + vector> ldgsts_flags; // true to remove, false to not // Important... without clear(), cin.eof() may evaluate to true on the second - // kernel + // kernel cin.clear(); clearerr(stdin); while (!cin.eof()) { @@ -307,7 +310,8 @@ void group_per_block(const char *filepath) { ceil(float(tb_dim_x * tb_dim_y * tb_dim_z) / 32)); // Size the ldgsts_flags vector - ldgsts_flags[i].resize(ceil(float(tb_dim_x * tb_dim_y * tb_dim_z) / 32)); + ldgsts_flags[i].resize( + ceil(float(tb_dim_x * tb_dim_y * tb_dim_z) / 32)); for (unsigned j = 0; j < ldgsts_flags[i].size(); j++) { ldgsts_flags[i][j] = true; } @@ -327,10 +331,10 @@ void group_per_block(const char *filepath) { insts[tb_id].tb_id_z = tb_id_z; insts[tb_id].initialized = true; } - //ss.ignore(); //remove the space - //rest_of_line.clear(); + // ss.ignore(); //remove the space + // rest_of_line.clear(); // getline(ss, rest_of_line); //get rest of the string! - string rest_of_line(ss.str().substr(ss.tellg()+1)); + string rest_of_line(ss.str().substr(ss.tellg() + 1)); // Ni: ignore the shmem LDGSTS instruction stringstream opcode_ss; @@ -347,21 +351,21 @@ void group_per_block(const char *filepath) { opcode_ss >> opcode; // Look up the warp inst table to see if this instruction has been - // registered. If yes, we just copy the pointer to that string. + // registered. If yes, we just copy the pointer to that string. const string *inst_ptr = warp_inst_lut.lookup_entry(rest_of_line); - if(!inst_ptr) inst_ptr = warp_inst_lut.register_new_entry(rest_of_line); + if (!inst_ptr) + inst_ptr = warp_inst_lut.register_new_entry(rest_of_line); + + // One actual LDGSTS instruction includes 2 LDGSTS instructions in the + // trace, because it has two memory references. This is trying to remove + // the one with the shared memory address. - // One actual LDGSTS instruction includes 2 LDGSTS instructions in the trace, - // because it has two memory references. - // This is trying to remove the one with the shared memory address. - if (opcode.find("LDGSTS") != string::npos) { if (!ldgsts_flags[tb_id][warpid_tb]) { insts[tb_id].warp_insts_array[warpid_tb].push_back(inst_ptr); } ldgsts_flags[tb_id][warpid_tb] = !ldgsts_flags[tb_id][warpid_tb]; - } - else { + } else { insts[tb_id].warp_insts_array[warpid_tb].push_back(inst_ptr); } } @@ -370,10 +374,12 @@ void group_per_block(const char *filepath) { for (unsigned i = 0; i < insts.size(); ++i) { // ofs< 0) { - cout << "\n" << "#BEGIN_TB" << "\n"; cout << "\n" - << "thread block = " << insts[i].tb_id_x << "," << insts[i].tb_id_y - << "," << insts[i].tb_id_z << "\n"; + << "#BEGIN_TB" + << "\n"; + cout << "\n" + << "thread block = " << insts[i].tb_id_x << "," << insts[i].tb_id_y + << "," << insts[i].tb_id_z << "\n"; } else { cerr << "Warning: Thread block " << insts[i].tb_id_x << "," << insts[i].tb_id_y << "," << insts[i].tb_id_z << " is empty" @@ -381,15 +387,17 @@ void group_per_block(const char *filepath) { continue; } for (unsigned j = 0; j < insts[i].warp_insts_array.size(); ++j) { - cout << "\n" << "warp = " << j << "\n"; + cout << "\n" + << "warp = " << j << "\n"; cout << "insts = " << insts[i].warp_insts_array[j].size() << "\n"; if (insts[i].warp_insts_array[j].size() == 0) { cerr << "Warning: Warp " << j << " in thread block" << insts[i].tb_id_x << "," << insts[i].tb_id_y << "," << insts[i].tb_id_z - << " is empty" << "\n"; + << " is empty" + << "\n"; } - for (auto it = insts[i].warp_insts_array[j].cbegin(); - it != insts[i].warp_insts_array[j].cend(); ++it) { + for (auto it = insts[i].warp_insts_array[j].cbegin(); + it != insts[i].warp_insts_array[j].cend(); ++it) { // dereference once: const string* // dereference twice: const string cout << **it << "\n";