Skip to content

Commit

Permalink
Merge branch 'dev' into sim-name
Browse files Browse the repository at this point in the history
  • Loading branch information
JRPan authored Aug 8, 2024
2 parents 953b8a8 + 4a90ea7 commit 7ef7f01
Show file tree
Hide file tree
Showing 14 changed files with 264 additions and 183 deletions.
14 changes: 11 additions & 3 deletions .github/workflows/long-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ name: Long Tests
on:
# Triggers the workflow on push or pull request events but only for the mydev branch
push:
# pull_request:
branches-ignore:
- "gh-readonly-queue**"
merge_group:

# Allows you to run this workflow manually from the Actions tab
Expand Down Expand Up @@ -74,7 +75,14 @@ jobs:
./util/plotting/plot-correlation.py -c ./statistics-archive/ubench/ampere-ubench-sass.csv -H ./hw_run/AMPERE-RTX3070/11.2/ | tee ampere-ubench-correl.txt
ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/
rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/
echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/"
BODY="Github CI - Build $GITHUB_REF SUCCESS.
Action link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
Branch/PR Name: $GITHUB_REF_NAME
Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/."
SUBJECT="Github CI - Build $GITHUB_REF SUCCESS"
TO="${{ secrets.GROUP_EMAIL }}"
echo "$BODY" | mail -s "$SUBJECT" "$TO"
echo "Correlation Report at: https://tgrogers-pc01.ecn.purdue.edu/github-ci/accel-sim/correl/git_${GITHUB_REF}"_"$GITHUB_RUN_NUMBER"_"$GITHUB_RUN_ATTEMPT/."
PTX-Simulation:
if: github.repository == 'accel-sim/accel-sim-framework'
runs-on: tgrogers-raid
Expand Down Expand Up @@ -102,7 +110,7 @@ jobs:
source ./gpu-simulator/setup_environment.sh
rm -rf ./gpu-app-collection
git clone [email protected]:accel-sim/gpu-app-collection.git
git clone -b dev [email protected]:accel-sim/gpu-app-collection.git
source ./gpu-app-collection/src/setup_environment
srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src
./gpu-app-collection/get_regression_data.sh
Expand Down
39 changes: 32 additions & 7 deletions .github/workflows/short-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ name: Short Tests
on:
# Triggers the workflow on push or pull request events but only for the mydev branch
push:
branches-ignore:
- "gh-readonly-queue**"
pull_request:

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
jobs:
SASS-Simulation:
runs-on: ubuntu-latest
container:
Expand All @@ -27,25 +29,48 @@ jobs:
run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh
PTX-Simulation:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
# env:

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v4
- name: Run Simulation
run: echo "skipped SASS-Simulation. Will perform in merge queue"
run: echo "skipped PTX-Simulation. Will perform in merge queue"
Tracer-Tool:
runs-on: ubuntu-latest
container:
image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7
# env:

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v4
- name: Run Simulation
run: echo "skipped SASS-Simulation. Will perform in merge queue"
run: echo "skipped Tracer-Simulation. Will perform in merge queue"
format-code:
runs-on: ubuntu-latest
needs: [SASS-Simulation, PTX-Simulation, Tracer-Tool]

permissions:
# Give the default GITHUB_TOKEN write permission to commit and push the
# added or changed files to the repository.
contents: write

steps:
- uses: actions/checkout@v4
# Other steps that change files in the repository go here
#
- name: Run clang-format
run: |
sudo apt-get install -y clang-format
./gpu-simulator/format-code.sh
./util/tracer_nvbit/tracer_tool/format-code.sh
- uses: stefanzweifel/git-auto-commit-action@v5
with:
# Optional. Commit message for the created commit.
# Defaults to "Apply automatic changes"
commit_message: Automated clang-format
# Optional. Option used by `git-status` to determine if the repository is
# dirty. See https://git-scm.com/docs/git-status#_options
status_options: '--untracked-files=no'
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pipeline {
source ./gpu-simulator/setup_environment.sh
rm -rf ./gpu-app-collection
git clone [email protected]:accel-sim/gpu-app-collection.git
git clone -b dev [email protected]:accel-sim/gpu-app-collection.git
source ./gpu-app-collection/src/setup_environment
srun -c20 make rodinia_2.0-ft GPU_Microbenchmark -j20 -C ./gpu-app-collection/src
./gpu-app-collection/get_regression_data.sh
Expand Down
2 changes: 1 addition & 1 deletion gpu-simulator/ISA_Def/ampere_opcode.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ static const std::unordered_map<std::string, OpcodeChar> Ampere_OpcodeMap = {
{"CCTLT", OpcodeChar(OP_CCTLT, ALU_OP)},

{"LDGDEPBAR", OpcodeChar(OP_LDGDEPBAR, ALU_OP)},
{"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)},
{"LDGSTS", OpcodeChar(OP_LDGSTS, LOAD_OP)},

// Uniform Datapath Instruction
// UDP unit
Expand Down
30 changes: 14 additions & 16 deletions gpu-simulator/trace-driven/trace_driven.cc
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,10 @@ bool trace_warp_inst_t::parse_from_trace_struct(
cache_op = CACHE_ALL;
break;
case OP_LDG:
// LDGSTS is loading the values needed directly from the global memory to shared memory.
// Before this feature, the values need to be loaded to registers first, then store to
// the shared memory.
case OP_LDGSTS: // Add for memcpy_async
// LDGSTS is loading the values needed directly from the global memory to
// shared memory. Before this feature, the values need to be loaded to
// registers first, then store to the shared memory.
case OP_LDGSTS: // Add for memcpy_async
case OP_LDL:
assert(data_size > 0);
memory_op = memory_load;
Expand All @@ -272,8 +272,7 @@ bool trace_warp_inst_t::parse_from_trace_struct(
else
space.set_type(global_space);
// Add for LDGSTS instruction
if (m_opcode == OP_LDGSTS)
m_is_ldgsts = true;
if (m_opcode == OP_LDGSTS) m_is_ldgsts = true;
// check the cache scope, if its strong GPU, then bypass L1
if (trace.check_opcode_contain(opcode_tokens, "STRONG") &&
trace.check_opcode_contain(opcode_tokens, "GPU")) {
Expand Down Expand Up @@ -369,19 +368,18 @@ bool trace_warp_inst_t::parse_from_trace_struct(
// barrier_type bar_type;
// reduction_type red_type;
break;
// LDGDEPBAR is to form a group containing the previous LDGSTS instructions that
// have not been grouped yet.
// In the implementation, a group number will be assigned once the instruction is
// met.
// LDGDEPBAR is to form a group containing the previous LDGSTS instructions
// that have not been grouped yet. In the implementation, a group number
// will be assigned once the instruction is met.
case OP_LDGDEPBAR:
m_is_ldgdepbar = true;
break;
// DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS
// instructions. It is associated with a immediate value. The immediate value
// indicates the last N LDGDEPBAR groups to not wait once the instruction is met.
// For example, if the immediate value is 1, then the last group is able to proceed
// even with DEPBAR present; if the immediate value is 0, then all of the groups
// need to finish before proceed.
// DEPBAR is served as a warp-wise barrier that is only effective for LDGSTS
// instructions. It is associated with a immediate value. The immediate
// value indicates the last N LDGDEPBAR groups to not wait once the
// instruction is met. For example, if the immediate value is 1, then the
// last group is able to proceed even with DEPBAR present; if the immediate
// value is 0, then all of the groups need to finish before proceed.
case OP_DEPBAR:
m_is_depbar = true;
m_depbar_group_no = trace.imm;
Expand Down
41 changes: 22 additions & 19 deletions gpu-simulator/trace-parser/trace_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
#include <vector>

#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <unistd.h>

#include "trace_parser.h"

Expand All @@ -32,7 +32,10 @@ void split(const std::string &str, std::vector<std::string> &cont,
}
}

inst_trace_t::inst_trace_t() { memadd_info = NULL; imm = 0;}
inst_trace_t::inst_trace_t() {
memadd_info = NULL;
imm = 0;
}

inst_trace_t::~inst_trace_t() {
if (memadd_info != NULL) delete memadd_info;
Expand Down Expand Up @@ -169,7 +172,7 @@ bool inst_trace_t::parse_from_string(std::string trace, unsigned trace_version,
ss >> temp;
sscanf(temp.c_str(), "R%d", &reg_src[i]);
}

// parse mem info
unsigned address_mode = 0;
unsigned mem_width = 0;
Expand Down Expand Up @@ -287,60 +290,60 @@ kernel_trace_t *trace_parser::parse_kernel_info(

std::string read_trace_cmd;
int _l = kerneltraces_filepath.length();
if(_l > 3 && kerneltraces_filepath.substr(_l-3, 3) == ".xz"){
if (_l > 3 && kerneltraces_filepath.substr(_l - 3, 3) == ".xz") {
// this is xz-compressed trace
read_trace_cmd = "xz -dc " + kerneltraces_filepath;
} else if(_l > 7 && kerneltraces_filepath.substr(_l-7, 7) == ".traceg"){
} else if (_l > 7 && kerneltraces_filepath.substr(_l - 7, 7) == ".traceg") {
// this is plain text trace
read_trace_cmd ="cat " + kerneltraces_filepath;
read_trace_cmd = "cat " + kerneltraces_filepath;
} else {
std::cerr << "Can't read trace. Only .xz and plain text are supported: "
<< kerneltraces_filepath <<"\n";
std::cerr << "Can't read trace. Only .xz and plain text are supported: "
<< kerneltraces_filepath << "\n";
exit(1);
}

// Create an interprocess channel, and fork out a data source process. The
// data source process reads trace from disk, write to the channel, and the
// simulator process read from the channel.
// simulator process read from the channel.
int *pipefd = kernel_info->pipefd;
if(pipe(pipefd) != 0){
if (pipe(pipefd) != 0) {
std::cerr << "Failed to create interprocess channel\n";
perror("pipe");
exit(1);
}

pid_t pid = fork();
if(pid == 0){
if (pid == 0) {
// The child process is the data source. Redirect its
// stdout to the write end of the pipe.
close(pipefd[0]);
dup2(pipefd[1], STDOUT_FILENO);

// When using GDB, sending Ctrl+C to the simulator will send a SIGINT signal
// to the child process as well, subsequently causing it to terminate. To
// avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal.
// avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal.
// Reference:
// https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes
// https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes
signal(SIGINT, SIG_IGN);

execle("/bin/sh", "sh", "-c", read_trace_cmd.c_str(), NULL, environ);
perror("execle"); // the child process shouldn't reach here if all is well.
perror("execle"); // the child process shouldn't reach here if all is well.
exit(1);
} else {
// parent (simulator)
close(pipefd[1]);
dup2(pipefd[0], STDIN_FILENO);
}
// Parent continues from here.
kernel_info->ifs = &std::cin;

// Parent continues from here.
kernel_info->ifs = &std::cin;
std::istream *ifs = kernel_info->ifs;

std::cout << "Processing kernel " << kerneltraces_filepath << std::endl;

std::string line;

// Important to clear the istream. Otherwise, the eofbit from the last
// Important to clear the istream. Otherwise, the eofbit from the last
// kernel may be carried over to this kernel
ifs->clear();
clearerr(stdin);
Expand Down Expand Up @@ -417,7 +420,7 @@ void trace_parser::kernel_finalizer(kernel_trace_t *trace_info) {
// The pipe read/write end file descriptors held by the child process would
// have been automatically closed when it terminated. But the parent
// process may read an arbitrary amount of trace files, so it has to close
// all file descriptors.
// all file descriptors.
close(trace_info->pipefd[0]);
close(trace_info->pipefd[1]);
delete trace_info;
Expand Down
4 changes: 2 additions & 2 deletions gpu-simulator/trace-parser/trace_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,12 @@ struct kernel_trace_t {
std::istream *ifs;
// Anonymous pipe through which the trace is transmitted from a trace reader
// process to the simulator process
int pipefd[2]={};
int pipefd[2] = {};
};

class trace_parser {
public:
trace_parser(){}
trace_parser() {}
trace_parser(const char *kernellist_filepath);

std::vector<trace_command> parse_commandlist_file();
Expand Down
2 changes: 2 additions & 0 deletions short-tests.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

set -e

if [ ! -n "$CUDA_INSTALL_PATH" ]; then
echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH.";
exit;
Expand Down
7 changes: 7 additions & 0 deletions util/job_launching/apps/define-all-apps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ GPU_Microbenchmark:
- shared_lat:
- args:
accel-sim-mem: 1G
- shared_bank_conflicts:
## argument 1 kernel has conflicts
- args: 1
accel-sim-mem: 1G
## argument 2 kernel doesn't have conflicts
- args: 2
accel-sim-mem: 1G
- MaxFlops:
- args:
accel-sim-mem: 1G
Expand Down
2 changes: 2 additions & 0 deletions util/job_launching/stats/example_stats.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ collect_aggregate:
- 'total dram reads\s*=\s*(.*)'
- 'total dram writes\s*=\s*(.*)'
- 'kernel_launch_uid\s*=\s*(.*)'
- 'gpgpu_n_shmem_bkconflict\s*=\s*(.*)'
- 'gpgpu_n_l1cache_bkconflict\s*=\s*(.*)'


# These stats are reset each kernel and should not be diff'd
Expand Down
14 changes: 12 additions & 2 deletions util/plotting/correl_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,18 @@
plottype="log",
stattype="counter",
),
CorrelStat(
chart_name="DRAM Reads",
CorrelStat(chart_name="Shared Memory Bank Conflicts",
plotfile="shmem-bank-conflict",
hw_eval="np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum\"])\
+ np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum\"])",
hw_error=None,
sim_eval="float(sim[\"gpgpu_n_shmem_bkconflict\s*=\s*(.*)\"])",
hw_name="all",
drophwnumbelow=0,
plottype="log",
stattype="counter"
),
CorrelStat(chart_name="DRAM Reads",
plotfile="dram-read-transactions",
hw_eval='np.average(hw["dram__sectors_read.sum"])',
hw_error=None,
Expand Down
15 changes: 8 additions & 7 deletions util/tracer_nvbit/tracer_tool/inject_funcs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
* extern "C" __device__ __noinline__
* To prevent "dead"-code elimination by the compiler.
*/
extern "C" __device__ __noinline__ void instrument_inst(
int pred, int opcode_id, int32_t vpc, bool is_mem, uint64_t addr,
int32_t width, int32_t desReg, int32_t srcReg1, int32_t srcReg2,
int32_t srcReg3, int32_t srcReg4, int32_t srcReg5, int32_t srcNum,
uint64_t immediate,
uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
uint64_t preported_dynamic_instr_counter, uint64_t pstop_report, uint32_t line_num) {
extern "C" __device__ __noinline__ void
instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
uint64_t addr, int32_t width, int32_t desReg, int32_t srcReg1,
int32_t srcReg2, int32_t srcReg3, int32_t srcReg4,
int32_t srcReg5, int32_t srcNum, uint64_t immediate,
uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
uint64_t preported_dynamic_instr_counter, uint64_t pstop_report,
uint32_t line_num) {

const int active_mask = __ballot_sync(__activemask(), 1);
const int predicate_mask = __ballot_sync(__activemask(), pred);
Expand Down
Loading

0 comments on commit 7ef7f01

Please sign in to comment.