Skip to content

Commit 8339e58

Browse files
committed
Add AMDGPU files
1 parent 8add81c commit 8339e58

10 files changed

+335
-0
lines changed

AMDGPU/.clang-format

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
BasedOnStyle: LLVM
2+
ColumnLimit: 120

AMDGPU/Project.toml

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[deps]
2+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
3+
AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
4+
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
5+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

AMDGPU/common.jl

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
using AxisKeys
2+
3+
function judge_map(estimates)
4+
ek = keys(estimates) |> collect
5+
ev = values(estimates)
6+
jm = [judge(e1,e2) for e1 in ev, e2 in ev]
7+
return KeyedArray(jm, to=ek, from=ek)
8+
end
9+
10+
function make_c_trial(nsamples)
11+
c_times = zeros(Float64,nsamples)
12+
c_gctimes = zeros(Float64,nsamples)
13+
c_memory = 0::Int64
14+
c_allocs = 0::Int64
15+
c_params = BenchmarkTools.DEFAULT_PARAMETERS
16+
c_params.samples = nsamples
17+
return BenchmarkTools.Trial(c_params,c_times,c_gctimes,c_memory,c_allocs)
18+
end

AMDGPU/diffusion_2d.cu

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <cuda.h>
2+
#include <stdint.h>
3+
4+
#include <chrono>
5+
using namespace std::chrono;
6+
using nano_double = duration<double, std::nano>;
7+
8+
#ifdef _WIN32
9+
#define EXPORT_API __declspec(dllexport)
10+
#else
11+
#define EXPORT_API
12+
#endif
13+
14+
#define A_new(ix, iy) _A_new[(iy) * (n - 2) + ix]
15+
#define A(ix, iy) _A[(iy)*n + ix]
16+
17+
__global__ void diffusion_kernel(double *_A_new, const double *_A, const int n, const double h) {
18+
int ix = blockIdx.x * blockDim.x + threadIdx.x;
19+
int iy = blockIdx.y * blockDim.y + threadIdx.y;
20+
if (ix < n - 2 && iy < n - 2) {
21+
A_new(ix, iy) = A(ix + 1, iy + 1) + h * (A(ix, iy + 1) + A(ix + 2, iy + 1) + A(ix + 1, iy) + A(ix + 1, iy + 2) -
22+
4.0 * A(ix + 1, iy + 1));
23+
}
24+
}
25+
26+
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples, const int n) {
27+
double *A_new, *A;
28+
cudaMalloc(&A_new, (n - 2) * (n - 2) * sizeof(double));
29+
cudaMalloc(&A, n * n * sizeof(double));
30+
31+
double h = 1.0 / 5.0;
32+
33+
cudaStream_t stream;
34+
cudaStreamCreate(&stream);
35+
36+
dim3 nthreads(32, 8);
37+
dim3 nblocks((n + nthreads.x - 1) / nthreads.x, (n + nthreads.y - 1) / nthreads.y);
38+
39+
for (int isample = 0; isample < nsamples; ++isample) {
40+
auto timer = high_resolution_clock::now();
41+
diffusion_kernel<<<nblocks, nthreads, 0, stream>>>(A_new, A, n, h);
42+
cudaStreamSynchronize(stream);
43+
auto elapsed = high_resolution_clock::now() - timer;
44+
auto time_total = duration_cast<nano_double>(elapsed).count();
45+
times[isample] = time_total;
46+
}
47+
48+
cudaFree(A_new);
49+
cudaFree(A);
50+
51+
cudaStreamDestroy(stream);
52+
}

AMDGPU/diffusion_2d.jl

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
N = 4096
2+
C_SAMPLES = 500
3+
4+
function diffusion_kernel!(A_new,A,h)
5+
ix = (blockIdx().x-1i32)*blockDim().x + threadIdx().x
6+
iy = (blockIdx().y-1i32)*blockDim().y + threadIdx().y
7+
if ix axes(A_new,1) && iy axes(A_new,2)
8+
@inbounds A_new[ix,iy] = A[ix+1,iy+1] + h*(A[ix,iy+1] + A[ix+2,iy+1] + A[ix+1,iy] + A[ix+1,iy+2] - 4.0*A[ix+1,iy+1])
9+
end
10+
return
11+
end
12+
13+
function run_julia_benchmarks(n)
14+
A_new = CuArray{Float64}(undef,n-2,n-2)
15+
A = CuArray{Float64}(undef,n ,n )
16+
h = 1/5
17+
nthreads = (32,8)
18+
nblocks = cld.(size(A_new),nthreads)
19+
20+
bm = @benchmark begin
21+
CUDA.@sync @cuda blocks=$nblocks threads=$nthreads diffusion_kernel!($A_new,$A,$h)
22+
end
23+
24+
CUDA.unsafe_free!(A_new)
25+
CUDA.unsafe_free!(A)
26+
27+
return bm
28+
end
29+
30+
function run_c_benchmarks(lib,nsamples,n)
31+
trial = make_c_trial(nsamples)
32+
33+
CUDA.reclaim()
34+
35+
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
36+
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
37+
38+
return trial
39+
end
40+
41+
group = BenchmarkGroup()
42+
group["julia"] = run_julia_benchmarks(N)
43+
44+
# Add baseline C benchmark
45+
libext = Sys.iswindows() ? "dll" : "so"
46+
libname = "diffusion_2d." * libext
47+
run(`nvcc -O3 -o $libname --shared diffusion_2d.cu`)
48+
Libdl.dlopen("./$libname") do lib
49+
group["reference"] = run_c_benchmarks(lib,C_SAMPLES,N)
50+
end
51+
52+
RESULTS["diffusion-2d"] = group
53+

AMDGPU/host_overhead.cu

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#include <cuda.h>
2+
#include <stdint.h>
3+
4+
#include <chrono>
5+
using namespace std::chrono;
6+
using nano_double = duration<double, std::nano>;
7+
8+
#ifdef _WIN32
9+
#define EXPORT_API __declspec(dllexport)
10+
#else
11+
#define EXPORT_API
12+
#endif
13+
14+
__global__ void sleep_kernel(const int64_t ncycles) {
15+
int64_t start = clock64();
16+
while (clock64() - start < ncycles) {
17+
__syncthreads();
18+
}
19+
}
20+
21+
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples,
22+
const int64_t ncycles) {
23+
cudaStream_t stream;
24+
cudaStreamCreate(&stream);
25+
26+
for (int isample = 0; isample < nsamples; ++isample) {
27+
auto timer = high_resolution_clock::now();
28+
sleep_kernel<<<1, 1, 0, stream>>>(ncycles);
29+
cudaStreamSynchronize(stream);
30+
auto elapsed = high_resolution_clock::now() - timer;
31+
auto time_total = duration_cast<nano_double>(elapsed).count();
32+
times[isample] = time_total;
33+
}
34+
35+
cudaStreamDestroy(stream);
36+
}

AMDGPU/host_overhead.jl

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
N_MILLISEC = 2
2+
C_SAMPLES = 2000
3+
4+
function sleep_kernel(ncycles)
5+
start = CUDA.clock(UInt64)
6+
while CUDA.clock(UInt64) - start < ncycles
7+
sync_threads()
8+
end
9+
return
10+
end
11+
12+
function run_julia_benchmarks(ncycles)
13+
suite = BenchmarkGroup()
14+
15+
suite["nonblocking"] = @benchmarkable begin
16+
@cuda sleep_kernel($ncycles)
17+
CUDA.synchronize()
18+
end
19+
20+
suite["blocking"] = @benchmarkable begin
21+
@cuda sleep_kernel($ncycles)
22+
CUDA.cuStreamSynchronize(stream())
23+
end
24+
25+
warmup(suite)
26+
return run(suite)
27+
end
28+
29+
function run_c_benchmarks(lib,nsamples,ncycles)
30+
trial = make_c_trial(nsamples)
31+
32+
CUDA.reclaim()
33+
34+
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
35+
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,ncycles::Cint)::Cvoid
36+
37+
return trial
38+
end
39+
40+
clock_rate = CUDA.attribute(device(),CUDA.DEVICE_ATTRIBUTE_CLOCK_RATE)
41+
ncycles = N_MILLISEC*clock_rate
42+
43+
group = run_julia_benchmarks(ncycles)
44+
45+
# Add baseline C benchmark
46+
libext = Sys.iswindows() ? "dll" : "so"
47+
libname = "host_overhead." * libext
48+
run(`nvcc -O3 -o $libname --shared host_overhead.cu`)
49+
Libdl.dlopen("./$libname") do lib
50+
group["reference"] = run_c_benchmarks(lib,C_SAMPLES,ncycles)
51+
end
52+
53+
RESULTS["host-overhead"] = group

AMDGPU/memcopy.cu

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include <cuda.h>
2+
#include <stdint.h>
3+
4+
#include <chrono>
5+
using namespace std::chrono;
6+
using nano_double = duration<double, std::nano>;
7+
8+
#ifdef _WIN32
9+
#define EXPORT_API __declspec(dllexport)
10+
#else
11+
#define EXPORT_API
12+
#endif
13+
14+
__global__ void memcopy_kernel(uint8_t *dst, const uint8_t *src, const int n) {
15+
int ix = blockIdx.x * blockDim.x + threadIdx.x;
16+
if (ix < n) {
17+
dst[ix] = src[ix];
18+
}
19+
}
20+
21+
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples,
22+
const int n) {
23+
uint8_t *dst, *src;
24+
cudaMalloc(&dst, n);
25+
cudaMalloc(&src, n);
26+
27+
cudaStream_t stream;
28+
cudaStreamCreate(&stream);
29+
30+
int nthreads = 256;
31+
int nblocks = (n + nthreads - 1) / nthreads;
32+
33+
for (int isample = 0; isample < nsamples; ++isample) {
34+
auto timer = high_resolution_clock::now();
35+
memcopy_kernel<<<nblocks, nthreads, 0, stream>>>(dst, src, n);
36+
cudaStreamSynchronize(stream);
37+
auto elapsed = high_resolution_clock::now() - timer;
38+
auto time_total = duration_cast<nano_double>(elapsed).count();
39+
times[isample] = time_total;
40+
}
41+
42+
cudaFree(src);
43+
cudaFree(dst);
44+
45+
cudaStreamDestroy(stream);
46+
}

AMDGPU/memcopy.jl

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
N_BYTES = 10^8
2+
C_SAMPLES = 500
3+
4+
function memcopy_kernel!(dst,src)
5+
ix = (blockIdx().x-1i32)*blockDim().x + threadIdx().x
6+
if ix <= length(dst)
7+
@inbounds dst[ix] = src[ix]
8+
end
9+
return
10+
end
11+
12+
function run_julia_benchmarks(nbytes)
13+
dst = CuArray{UInt8}(undef,nbytes)
14+
src = CuArray{UInt8}(undef,nbytes)
15+
nthreads = 256
16+
nblocks = cld(length(dst),nthreads)
17+
18+
bm = @benchmark begin
19+
CUDA.@sync @cuda blocks=$nblocks threads=$nthreads memcopy_kernel!($dst,$src)
20+
end
21+
22+
CUDA.unsafe_free!(dst)
23+
CUDA.unsafe_free!(src)
24+
25+
return bm
26+
end
27+
28+
function run_c_benchmarks(lib,nsamples,nbytes)
29+
trial = make_c_trial(nsamples)
30+
31+
CUDA.reclaim()
32+
33+
sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
34+
@ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,nbytes::Cint)::Cvoid
35+
36+
return trial
37+
end
38+
39+
group = BenchmarkGroup()
40+
group["julia"] = run_julia_benchmarks(N_BYTES)
41+
42+
# Add baseline C benchmark
43+
libext = Sys.iswindows() ? "dll" : "so"
44+
libname = "memcopy." * libext
45+
run(`nvcc -O3 -o $libname --shared memcopy.cu`)
46+
Libdl.dlopen("./$libname") do lib
47+
group["reference"] = run_c_benchmarks(lib,C_SAMPLES,N_BYTES)
48+
end
49+
50+
RESULTS["memcopy"] = group
51+

AMDGPU/runbenchmarks.jl

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
using CUDA
2+
import CUDA: i32
3+
4+
using BenchmarkTools
5+
using Statistics
6+
using Libdl
7+
8+
RESULTS = BenchmarkGroup()
9+
10+
include("common.jl")
11+
12+
@info "host overhead"
13+
include("host_overhead.jl")
14+
15+
@info "memcopy"
16+
include("memcopy.jl")
17+
18+
@info "diffusion"
19+
include("diffusion_2d.jl")

0 commit comments

Comments
 (0)