|
# Inputs for the 2-D diffusion benchmark:
#   c_samples - sample count handed to the C reference benchmark
#   n_range   - problem sizes N that are swept: 2^8, 2^10, 2^12, 2^14
INPUTS["diffusion-2d"] = (
    c_samples = 2000,
    n_range   = 2 .^ (8:2:14),
)
3 | 5 |
|
"""
    diffusion_kernel!(A_new, A, h)

GPU kernel: perform one explicit 5-point-stencil update, one output element per work-item.

Each work-item computes a 1-based index `(ix, iy)` from its workgroup and work-item
indices. If that index lies inside `A_new`, it stores

    A_new[ix, iy] = c + h * (A[ix, iy+1] + A[ix+2, iy+1] + A[ix+1, iy] + A[ix+1, iy+2] - 4c)

with `c = A[ix+1, iy+1]`. Work-items whose index falls outside `A_new` do nothing.

`A` is read at offsets up to `+2` in each dimension without bounds checks, so the caller
must pass an `A` that is at least two elements larger than `A_new` along both axes.

Returns `nothing`.
"""
function diffusion_kernel!(A_new, A, h)
    ix = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    iy = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
    if ix ∈ axes(A_new, 1) && iy ∈ axes(A_new, 2)
        @inbounds begin
            c = A[ix+1, iy+1]
            # The integer literal 4 promotes to the element type of `A`; for Float64 this
            # is bit-identical to 4.0 but does not force Float64 on narrower element types.
            A_new[ix, iy] = c + h * (A[ix, iy+1] + A[ix+2, iy+1] + A[ix+1, iy] + A[ix+1, iy+2] - 4 * c)
        end
    end
    return
end
12 | 14 |
|
"""
    run_julia_benchmarks(n)

Benchmark `diffusion_kernel!` launched through `@roc` and return the `@benchmark` result.

Allocates an uninitialised `n × n` `Float64` input and an `(n-2) × (n-2)` output on the
device, times "launch + `AMDGPU.synchronize()`" with `@benchmark`, and frees both device
buffers before returning, including when the launch or the benchmark throws.
"""
function run_julia_benchmarks(n)
    A_new = ROCArray{Float64}(undef, n - 2, n - 2)
    A     = ROCArray{Float64}(undef, n, n)
    try
        h = 1/5

        nthreads = (128, 2)
        # One workgroup per `nthreads`-sized tile of the output, rounded up per dimension.
        # NOTE(review): this assumes `gridsize` counts workgroups; older AMDGPU.jl releases
        # may have interpreted it as the total number of work-items. Confirm against the
        # AMDGPU.jl version this project pins.
        nblocks = cld.(size(A_new), nthreads)

        bm = @benchmark begin
            @roc gridsize=$nblocks groupsize=$nthreads diffusion_kernel!($A_new, $A, $h)
            AMDGPU.synchronize()
        end

        return bm
    finally
        # Release device memory eagerly rather than waiting for the GC, on every exit path.
        AMDGPU.unsafe_free!(A_new)
        AMDGPU.unsafe_free!(A)
    end
end
29 | 32 |
|
"""
    run_c_benchmarks(lib, nsamples, n)

Run the compiled reference benchmark from `lib` and return the trial it filled.

Creates a trial with `make_c_trial(nsamples)`, looks up the `run_benchmark` symbol in
`lib`, and calls it as `run_benchmark(double *times, int nsamples, int n)` with
`trial.times` as the first argument.

# Arguments
- `lib`: library handle accepted by `Libdl.dlsym`.
- `nsamples`: sample count; passed both to `make_c_trial` and to the C function (as `Cint`).
- `n`: problem size forwarded to the C function (as `Cint`).
"""
function run_c_benchmarks(lib, nsamples, n)
    trial = make_c_trial(nsamples)

    sym = Libdl.dlsym(lib, :run_benchmark)
    @ccall $sym(trial.times::Ptr{Cdouble}, nsamples::Cint, n::Cint)::Cvoid

    return trial
end
40 | 41 |
|
# Results for this benchmark, keyed by problem size N; each entry holds a "julia" and a
# "reference" measurement.
group = BenchmarkGroup()

# Compile C benchmark
libext = Sys.iswindows() ? "dll" : "so"
libname = "diffusion_2d." * libext
run(`hipcc -O3 -o $libname --shared -fPIC diffusion_2d.cu`)

# The library stays open only for the duration of the sweep; the `do` form of `dlopen`
# closes it afterwards.
Libdl.dlopen("./$libname") do lib
    for N in INPUTS["diffusion-2d"].n_range
        @info "N = $N"
        group_n = BenchmarkGroup()
        group_n["julia"] = run_julia_benchmarks(N)
        group_n["reference"] = run_c_benchmarks(lib, INPUTS["diffusion-2d"].c_samples, N)
        group[N] = group_n
        display(group_n)
    end
end

RESULTS["diffusion-2d"] = group
53 |
| - |
|
0 commit comments