Skip to content

Commit f435ddd

Browse files
committed
Update benchmarks
1 parent 8339e58 commit f435ddd

15 files changed

+116
-84
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,5 @@ Manifest.toml
2727

2828
*.dll
2929
*.exp
30-
*.lib
30+
*.lib
31+
*.so

AMDGPU/LocalPreferences.toml

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[AMDGPU]
2+
use_artifacts = false

AMDGPU/Project.toml

+1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
33
AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
44
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
5+
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
56
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

AMDGPU/common.jl

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1+
using AMDGPU
2+
using BenchmarkTools
3+
using Statistics
4+
using Libdl
15
using AxisKeys
6+
using Plots
27

38
function judge_map(estimates)
49
ek = keys(estimates) |> collect
@@ -15,4 +20,7 @@ function make_c_trial(nsamples)
1520
c_params = BenchmarkTools.DEFAULT_PARAMETERS
1621
c_params.samples = nsamples
1722
return BenchmarkTools.Trial(c_params,c_times,c_gctimes,c_memory,c_allocs)
18-
end
23+
end
24+
25+
RESULTS = BenchmarkGroup()
26+
INPUTS = Dict()

AMDGPU/diffusion_2d.cu

+12-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#include <cuda.h>
1+
#include "hip/hip_runtime.h"
2+
#include <hip/hip_runtime.h>
23
#include <stdint.h>
34

45
#include <chrono>
@@ -25,28 +26,28 @@ __global__ void diffusion_kernel(double *_A_new, const double *_A, const int n,
2526

2627
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples, const int n) {
2728
double *A_new, *A;
28-
cudaMalloc(&A_new, (n - 2) * (n - 2) * sizeof(double));
29-
cudaMalloc(&A, n * n * sizeof(double));
29+
hipMalloc(&A_new, (n - 2) * (n - 2) * sizeof(double));
30+
hipMalloc(&A, n * n * sizeof(double));
3031

3132
double h = 1.0 / 5.0;
3233

33-
cudaStream_t stream;
34-
cudaStreamCreate(&stream);
34+
hipStream_t stream;
35+
hipStreamCreate(&stream);
3536

36-
dim3 nthreads(32, 8);
37+
dim3 nthreads(128, 2);
3738
dim3 nblocks((n + nthreads.x - 1) / nthreads.x, (n + nthreads.y - 1) / nthreads.y);
3839

3940
for (int isample = 0; isample < nsamples; ++isample) {
4041
auto timer = high_resolution_clock::now();
41-
diffusion_kernel<<<nblocks, nthreads, 0, stream>>>(A_new, A, n, h);
42-
cudaStreamSynchronize(stream);
42+
hipLaunchKernelGGL(diffusion_kernel, nblocks, nthreads, 0, stream, A_new, A, n, h);
43+
hipStreamSynchronize(stream);
4344
auto elapsed = high_resolution_clock::now() - timer;
4445
auto time_total = duration_cast<nano_double>(elapsed).count();
4546
times[isample] = time_total;
4647
}
4748

48-
cudaFree(A_new);
49-
cudaFree(A);
49+
hipFree(A_new);
50+
hipFree(A);
5051

51-
cudaStreamDestroy(stream);
52+
hipStreamDestroy(stream);
5253
}

AMDGPU/diffusion_2d.jl

+25-18
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,60 @@
1-
N = 4096
2-
C_SAMPLES = 500
1+
INPUTS["diffusion-2d"] = (
2+
c_samples = 2000,
3+
n_range = 2 .^ (8:2:14),
4+
)
35

46
function diffusion_kernel!(A_new,A,h)
5-
ix = (blockIdx().x-1i32)*blockDim().x + threadIdx().x
6-
iy = (blockIdx().y-1i32)*blockDim().y + threadIdx().y
7+
ix = (workgroupIdx().x-1)*workgroupDim().x + workitemIdx().x
8+
iy = (workgroupIdx().y-1)*workgroupDim().y + workitemIdx().y
79
if ix axes(A_new,1) && iy axes(A_new,2)
810
@inbounds A_new[ix,iy] = A[ix+1,iy+1] + h*(A[ix,iy+1] + A[ix+2,iy+1] + A[ix+1,iy] + A[ix+1,iy+2] - 4.0*A[ix+1,iy+1])
911
end
1012
return
1113
end
1214

1315
function run_julia_benchmarks(n)
14-
A_new = CuArray{Float64}(undef,n-2,n-2)
15-
A = CuArray{Float64}(undef,n ,n )
16+
A_new = ROCArray{Float64}(undef,n-2,n-2)
17+
A = ROCArray{Float64}(undef,n ,n )
1618
h = 1/5
17-
nthreads = (32,8)
19+
nthreads = (128,2)
1820
nblocks = cld.(size(A_new),nthreads)
1921

2022
bm = @benchmark begin
21-
CUDA.@sync @cuda blocks=$nblocks threads=$nthreads diffusion_kernel!($A_new,$A,$h)
23+
@roc gridsize=$nblocks groupsize=$nthreads diffusion_kernel!($A_new,$A,$h)
24+
AMDGPU.synchronize()
2225
end
2326

24-
CUDA.unsafe_free!(A_new)
25-
CUDA.unsafe_free!(A)
27+
AMDGPU.unsafe_free!(A_new)
28+
AMDGPU.unsafe_free!(A)
2629

2730
return bm
2831
end
2932

3033
function run_c_benchmarks(lib,nsamples,n)
3134
trial = make_c_trial(nsamples)
3235

33-
CUDA.reclaim()
34-
35-
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
36+
sym = Libdl.dlsym(lib,:run_benchmark)
3637
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
3738

3839
return trial
3940
end
4041

4142
group = BenchmarkGroup()
42-
group["julia"] = run_julia_benchmarks(N)
4343

44-
# Add baseline C benchmark
44+
# Compile C benchmark
4545
libext = Sys.iswindows() ? "dll" : "so"
4646
libname = "diffusion_2d." * libext
47-
run(`nvcc -O3 -o $libname --shared diffusion_2d.cu`)
47+
run(`hipcc -O3 -o $libname --shared -fPIC diffusion_2d.cu`)
48+
4849
Libdl.dlopen("./$libname") do lib
49-
group["reference"] = run_c_benchmarks(lib,C_SAMPLES,N)
50+
for N in INPUTS["diffusion-2d"].n_range
51+
@info "N = $N"
52+
group_n = BenchmarkGroup()
53+
group_n["julia"] = run_julia_benchmarks(N)
54+
group_n["reference"] = run_c_benchmarks(lib,INPUTS["diffusion-2d"].c_samples,N)
55+
group[N] = group_n
56+
display(group_n)
57+
end
5058
end
5159

5260
RESULTS["diffusion-2d"] = group
53-

AMDGPU/host_overhead.cu

+7-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#include <cuda.h>
1+
#include "hip/hip_runtime.h"
2+
#include <hip/hip_runtime.h>
23
#include <stdint.h>
34

45
#include <chrono>
@@ -20,17 +21,17 @@ __global__ void sleep_kernel(const int64_t ncycles) {
2021

2122
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples,
2223
const int64_t ncycles) {
23-
cudaStream_t stream;
24-
cudaStreamCreate(&stream);
24+
hipStream_t stream;
25+
hipStreamCreate(&stream);
2526

2627
for (int isample = 0; isample < nsamples; ++isample) {
2728
auto timer = high_resolution_clock::now();
28-
sleep_kernel<<<1, 1, 0, stream>>>(ncycles);
29-
cudaStreamSynchronize(stream);
29+
hipLaunchKernelGGL(sleep_kernel, 1, 1, 0, stream, ncycles);
30+
hipStreamSynchronize(stream);
3031
auto elapsed = high_resolution_clock::now() - timer;
3132
auto time_total = duration_cast<nano_double>(elapsed).count();
3233
times[isample] = time_total;
3334
}
3435

35-
cudaStreamDestroy(stream);
36+
hipStreamDestroy(stream);
3637
}

AMDGPU/host_overhead.jl

+1-4
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,6 @@ end
2929
function run_c_benchmarks(lib,nsamples,ncycles)
3030
trial = make_c_trial(nsamples)
3131

32-
CUDA.reclaim()
33-
34-
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
3532
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,ncycles::Cint)::Cvoid
3633

3734
return trial
@@ -45,7 +42,7 @@ group = run_julia_benchmarks(ncycles)
4542
# Add baseline C benchmark
4643
libext = Sys.iswindows() ? "dll" : "so"
4744
libname = "host_overhead." * libext
48-
run(`nvcc -O3 -o $libname --shared host_overhead.cu`)
45+
run(`hipcc -O3 -o $libname --shared -fPIC host_overhead.cu`)
4946
Libdl.dlopen("./$libname") do lib
5047
group["reference"] = run_c_benchmarks(lib,C_SAMPLES,ncycles)
5148
end

AMDGPU/memcopy.cu

+13-12
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#include <cuda.h>
1+
#include "hip/hip_runtime.h"
2+
#include <hip/hip_runtime.h>
23
#include <stdint.h>
34

45
#include <chrono>
@@ -11,7 +12,7 @@ using nano_double = duration<double, std::nano>;
1112
#define EXPORT_API
1213
#endif
1314

14-
__global__ void memcopy_kernel(uint8_t *dst, const uint8_t *src, const int n) {
15+
__global__ void memcopy_kernel(double *dst, const double *src, const int n) {
1516
int ix = blockIdx.x * blockDim.x + threadIdx.x;
1617
if (ix < n) {
1718
dst[ix] = src[ix];
@@ -20,27 +21,27 @@ __global__ void memcopy_kernel(uint8_t *dst, const uint8_t *src, const int n) {
2021

2122
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples,
2223
const int n) {
23-
uint8_t *dst, *src;
24-
cudaMalloc(&dst, n);
25-
cudaMalloc(&src, n);
24+
double *dst, *src;
25+
hipMalloc(&dst, n * sizeof(double));
26+
hipMalloc(&src, n * sizeof(double));
2627

27-
cudaStream_t stream;
28-
cudaStreamCreate(&stream);
28+
hipStream_t stream;
29+
hipStreamCreate(&stream);
2930

3031
int nthreads = 256;
3132
int nblocks = (n + nthreads - 1) / nthreads;
3233

3334
for (int isample = 0; isample < nsamples; ++isample) {
3435
auto timer = high_resolution_clock::now();
35-
memcopy_kernel<<<nblocks, nthreads, 0, stream>>>(dst, src, n);
36-
cudaStreamSynchronize(stream);
36+
hipLaunchKernelGGL(memcopy_kernel, nblocks, nthreads, 0, stream, dst, src, n);
37+
hipStreamSynchronize(stream);
3738
auto elapsed = high_resolution_clock::now() - timer;
3839
auto time_total = duration_cast<nano_double>(elapsed).count();
3940
times[isample] = time_total;
4041
}
4142

42-
cudaFree(src);
43-
cudaFree(dst);
43+
hipFree(src);
44+
hipFree(dst);
4445

45-
cudaStreamDestroy(stream);
46+
hipStreamDestroy(stream);
4647
}

AMDGPU/memcopy.jl

+26-18
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,58 @@
1-
N_BYTES = 10^8
2-
C_SAMPLES = 500
1+
INPUTS["memcopy"] = (
2+
n_range = 2 .^ (16:2:28),
3+
c_samples = 2000,
4+
)
35

46
function memcopy_kernel!(dst,src)
5-
ix = (blockIdx().x-1i32)*blockDim().x + threadIdx().x
7+
ix = (workgroupIdx().x-1)*workgroupDim().x + workitemIdx().x
68
if ix <= length(dst)
79
@inbounds dst[ix] = src[ix]
810
end
911
return
1012
end
1113

12-
function run_julia_benchmarks(nbytes)
13-
dst = CuArray{UInt8}(undef,nbytes)
14-
src = CuArray{UInt8}(undef,nbytes)
14+
function run_julia_benchmarks(n)
15+
dst = ROCArray{Float64}(undef,n)
16+
src = ROCArray{Float64}(undef,n)
1517
nthreads = 256
1618
nblocks = cld(length(dst),nthreads)
1719

1820
bm = @benchmark begin
19-
CUDA.@sync @cuda blocks=$nblocks threads=$nthreads memcopy_kernel!($dst,$src)
21+
@roc gridsize=$nblocks groupsize=$nthreads memcopy_kernel!($dst,$src)
22+
AMDGPU.synchronize()
2023
end
2124

22-
CUDA.unsafe_free!(dst)
23-
CUDA.unsafe_free!(src)
25+
AMDGPU.unsafe_free!(dst)
26+
AMDGPU.unsafe_free!(src)
2427

2528
return bm
2629
end
2730

28-
function run_c_benchmarks(lib,nsamples,nbytes)
31+
function run_c_benchmarks(lib,nsamples,n)
2932
trial = make_c_trial(nsamples)
3033

31-
CUDA.reclaim()
32-
33-
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
34-
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,nbytes::Cint)::Cvoid
34+
sym = Libdl.dlsym(lib,:run_benchmark)
35+
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
3536

3637
return trial
3738
end
3839

3940
group = BenchmarkGroup()
40-
group["julia"] = run_julia_benchmarks(N_BYTES)
4141

42-
# Add baseline C benchmark
42+
# Compile C benchmark
4343
libext = Sys.iswindows() ? "dll" : "so"
4444
libname = "memcopy." * libext
45-
run(`nvcc -O3 -o $libname --shared memcopy.cu`)
45+
run(`hipcc -O3 -o $libname --shared -fPIC memcopy.cu`)
46+
4647
Libdl.dlopen("./$libname") do lib
47-
group["reference"] = run_c_benchmarks(lib,C_SAMPLES,N_BYTES)
48+
for N in INPUTS["memcopy"].n_range
49+
@info "N = $N"
50+
group_n = BenchmarkGroup()
51+
group_n["julia"] = run_julia_benchmarks(N)
52+
group_n["reference"] = run_c_benchmarks(lib,INPUTS["memcopy"].c_samples,N)
53+
group[N] = group_n
54+
display(group_n)
55+
end
4856
end
4957

5058
RESULTS["memcopy"] = group

AMDGPU/runbenchmarks.jl

+12-10
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,3 @@
1-
using CUDA
2-
import CUDA: i32
3-
4-
using BenchmarkTools
5-
using Statistics
6-
using Libdl
7-
8-
RESULTS = BenchmarkGroup()
9-
101
include("common.jl")
112

123
@info "host overhead"
@@ -16,4 +7,15 @@ include("host_overhead.jl")
167
include("memcopy.jl")
178

189
@info "diffusion"
19-
include("diffusion_2d.jl")
10+
include("diffusion_2d.jl")
11+
12+
abstract type HPCBenchmark end
13+
14+
_BENCHMARKS = Dict{Symbol, HPCBenchmark}
15+
16+
17+
function runbenchmarks(benchmarks=:all)
18+
if benchmarks == :all
19+
benchmarks = collect(keys(_BENCHMARKS))
20+
end
21+
end

AMDGPU/setup_env.sh

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module load LUMI/22.08
2+
module load partition/G
3+
module load rocm/5.3.3

CUDA/diffusion_2d.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ function run_c_benchmarks(lib,nsamples,n)
3232

3333
CUDA.reclaim()
3434

35-
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
35+
sym = Libdl.dlsym(lib,:run_benchmark)
3636
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
3737

3838
return trial

0 commit comments

Comments
 (0)