Update benchmarks

utkinis · utkinis · commit f435ddd40002 · 2023-06-27T17:40:59.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -27,4 +27,5 @@ Manifest.toml
 
 *.dll
 *.exp
-*.lib
+*.lib
+*.so
diff --git a/AMDGPU/LocalPreferences.toml b/AMDGPU/LocalPreferences.toml
@@ -0,0 +1,2 @@
+[AMDGPU]
+use_artifacts = false
diff --git a/AMDGPU/Project.toml b/AMDGPU/Project.toml
@@ -2,4 +2,5 @@
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/AMDGPU/common.jl b/AMDGPU/common.jl
@@ -1,4 +1,9 @@
+using AMDGPU
+using BenchmarkTools
+using Statistics
+using Libdl
 using AxisKeys
+using Plots
 
 function judge_map(estimates)
     ek = keys(estimates) |> collect
@@ -15,4 +20,7 @@ function make_c_trial(nsamples)
     c_params  = BenchmarkTools.DEFAULT_PARAMETERS
     c_params.samples = nsamples
     return BenchmarkTools.Trial(c_params,c_times,c_gctimes,c_memory,c_allocs)
-end
+end
+
+RESULTS = BenchmarkGroup()
+INPUTS  = Dict()
diff --git a/AMDGPU/diffusion_2d.cu b/AMDGPU/diffusion_2d.cu
@@ -1,4 +1,5 @@
-#include <cuda.h>
+#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
 #include <stdint.h>
 
 #include <chrono>
@@ -25,28 +26,28 @@ __global__ void diffusion_kernel(double *_A_new, const double *_A, const int n,
 
 extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples, const int n) {
   double *A_new, *A;
-  cudaMalloc(&A_new, (n - 2) * (n - 2) * sizeof(double));
-  cudaMalloc(&A, n * n * sizeof(double));
+  hipMalloc(&A_new, (n - 2) * (n - 2) * sizeof(double));
+  hipMalloc(&A, n * n * sizeof(double));
 
   double h = 1.0 / 5.0;
 
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
+  hipStream_t stream;
+  hipStreamCreate(&stream);
 
-  dim3 nthreads(32, 8);
+  dim3 nthreads(128, 2);
   dim3 nblocks((n + nthreads.x - 1) / nthreads.x, (n + nthreads.y - 1) / nthreads.y);
 
   for (int isample = 0; isample < nsamples; ++isample) {
     auto timer = high_resolution_clock::now();
-    diffusion_kernel<<<nblocks, nthreads, 0, stream>>>(A_new, A, n, h);
-    cudaStreamSynchronize(stream);
+    hipLaunchKernelGGL(diffusion_kernel, nblocks, nthreads, 0, stream, A_new, A, n, h);
+    hipStreamSynchronize(stream);
     auto elapsed = high_resolution_clock::now() - timer;
     auto time_total = duration_cast<nano_double>(elapsed).count();
     times[isample] = time_total;
   }
 
-  cudaFree(A_new);
-  cudaFree(A);
+  hipFree(A_new);
+  hipFree(A);
 
-  cudaStreamDestroy(stream);
+  hipStreamDestroy(stream);
 }
diff --git a/AMDGPU/diffusion_2d.jl b/AMDGPU/diffusion_2d.jl
@@ -1,53 +1,60 @@
-N         = 4096
-C_SAMPLES = 500
+INPUTS["diffusion-2d"] = (
+    c_samples = 2000,
+    n_range   = 2 .^ (8:2:14),
+)
 
 function diffusion_kernel!(A_new,A,h)
-    ix = (blockIdx().x-1i32)*blockDim().x + threadIdx().x
-    iy = (blockIdx().y-1i32)*blockDim().y + threadIdx().y
+    ix = (workgroupIdx().x-1)*workgroupDim().x + workitemIdx().x
+    iy = (workgroupIdx().y-1)*workgroupDim().y + workitemIdx().y
     if ix ∈ axes(A_new,1) && iy ∈ axes(A_new,2)
         @inbounds A_new[ix,iy] = A[ix+1,iy+1] + h*(A[ix,iy+1] + A[ix+2,iy+1] + A[ix+1,iy] + A[ix+1,iy+2] - 4.0*A[ix+1,iy+1])
     end
     return
 end
 
 function run_julia_benchmarks(n)
-    A_new = CuArray{Float64}(undef,n-2,n-2)
-    A     = CuArray{Float64}(undef,n  ,n  )
+    A_new = ROCArray{Float64}(undef,n-2,n-2)
+    A     = ROCArray{Float64}(undef,n  ,n  )
     h     = 1/5
-    nthreads = (32,8)
+    nthreads = (128,2)
     nblocks  = cld.(size(A_new),nthreads)
 
     bm = @benchmark begin
-        CUDA.@sync @cuda blocks=$nblocks threads=$nthreads diffusion_kernel!($A_new,$A,$h)
+        @roc gridsize=$nblocks groupsize=$nthreads diffusion_kernel!($A_new,$A,$h)
+        AMDGPU.synchronize()
     end
 
-    CUDA.unsafe_free!(A_new)
-    CUDA.unsafe_free!(A)
+    AMDGPU.unsafe_free!(A_new)
+    AMDGPU.unsafe_free!(A)
 
     return bm
 end
 
 function run_c_benchmarks(lib,nsamples,n)
     trial = make_c_trial(nsamples)
 
-    CUDA.reclaim()
-
-    sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
+    sym = Libdl.dlsym(lib,:run_benchmark)
     @ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
 
     return trial
 end
 
 group = BenchmarkGroup()
-group["julia"] = run_julia_benchmarks(N)
 
-# Add baseline C benchmark
+# Compile C benchmark
 libext  = Sys.iswindows() ? "dll" : "so"
 libname = "diffusion_2d." * libext
-run(`nvcc -O3 -o $libname --shared diffusion_2d.cu`)
+run(`hipcc -O3 -o $libname --shared -fPIC diffusion_2d.cu`)
+
 Libdl.dlopen("./$libname") do lib
-    group["reference"] = run_c_benchmarks(lib,C_SAMPLES,N)
+    for N in INPUTS["diffusion-2d"].n_range
+        @info "N = $N"
+        group_n = BenchmarkGroup()
+        group_n["julia"]     = run_julia_benchmarks(N)
+        group_n["reference"] = run_c_benchmarks(lib,INPUTS["diffusion-2d"].c_samples,N)
+        group[N] = group_n
+        display(group_n)
+    end
 end
 
 RESULTS["diffusion-2d"] = group
-
diff --git a/AMDGPU/host_overhead.cu b/AMDGPU/host_overhead.cu
@@ -1,4 +1,5 @@
-#include <cuda.h>
+#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
 #include <stdint.h>
 
 #include <chrono>
@@ -20,17 +21,17 @@ __global__ void sleep_kernel(const int64_t ncycles) {
 
 extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples,
                                          const int64_t ncycles) {
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
+  hipStream_t stream;
+  hipStreamCreate(&stream);
 
   for (int isample = 0; isample < nsamples; ++isample) {
     auto timer = high_resolution_clock::now();
-    sleep_kernel<<<1, 1, 0, stream>>>(ncycles);
-    cudaStreamSynchronize(stream);
+    hipLaunchKernelGGL(sleep_kernel, 1, 1, 0, stream, ncycles);
+    hipStreamSynchronize(stream);
     auto elapsed = high_resolution_clock::now() - timer;
     auto time_total = duration_cast<nano_double>(elapsed).count();
     times[isample] = time_total;
   }
 
-  cudaStreamDestroy(stream);
+  hipStreamDestroy(stream);
 }
diff --git a/AMDGPU/host_overhead.jl b/AMDGPU/host_overhead.jl
@@ -29,9 +29,9 @@ end
 function run_c_benchmarks(lib,nsamples,ncycles)
     trial = make_c_trial(nsamples)
 
-    CUDA.reclaim()
-
-    sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
-    @ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,ncycles::Cint)::Cvoid
+    # Look up the C entry point in the already opened library handle `lib`.
+    sym = Libdl.dlsym(lib,:run_benchmark)
+    # `run_benchmark` in host_overhead.cu declares `const int64_t ncycles`, hence Int64.
+    @ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,ncycles::Int64)::Cvoid
 
     return trial
@@ -45,7 +45,7 @@ group = run_julia_benchmarks(ncycles)
 # Add baseline C benchmark
 libext  = Sys.iswindows() ? "dll" : "so"
 libname = "host_overhead." * libext
-run(`nvcc -O3 -o $libname --shared host_overhead.cu`)
+run(`hipcc -O3 -o $libname --shared -fPIC host_overhead.cu`)
 Libdl.dlopen("./$libname") do lib
     group["reference"] = run_c_benchmarks(lib,C_SAMPLES,ncycles)
 end
diff --git a/AMDGPU/memcopy.cu b/AMDGPU/memcopy.cu
@@ -1,4 +1,5 @@
-#include <cuda.h>
+#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
 #include <stdint.h>
 
 #include <chrono>
@@ -11,7 +12,7 @@ using nano_double = duration<double, std::nano>;
 #define EXPORT_API
 #endif
 
-__global__ void memcopy_kernel(uint8_t *dst, const uint8_t *src, const int n) {
+__global__ void memcopy_kernel(double *dst, const double *src, const int n) {
   int ix = blockIdx.x * blockDim.x + threadIdx.x;
   if (ix < n) {
     dst[ix] = src[ix];
@@ -20,27 +21,27 @@ __global__ void memcopy_kernel(uint8_t *dst, const uint8_t *src, const int n) {
 
 extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples,
                                          const int n) {
-  uint8_t *dst, *src;
-  cudaMalloc(&dst, n);
-  cudaMalloc(&src, n);
+  double *dst, *src;
+  hipMalloc(&dst, n * sizeof(double));
+  hipMalloc(&src, n * sizeof(double));
 
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
+  hipStream_t stream;
+  hipStreamCreate(&stream);
 
   int nthreads = 256;
   int nblocks = (n + nthreads - 1) / nthreads;
 
   for (int isample = 0; isample < nsamples; ++isample) {
     auto timer = high_resolution_clock::now();
-    memcopy_kernel<<<nblocks, nthreads, 0, stream>>>(dst, src, n);
-    cudaStreamSynchronize(stream);
+    hipLaunchKernelGGL(memcopy_kernel, nblocks, nthreads, 0, stream, dst, src, n);
+    hipStreamSynchronize(stream);
     auto elapsed = high_resolution_clock::now() - timer;
     auto time_total = duration_cast<nano_double>(elapsed).count();
     times[isample] = time_total;
   }
 
-  cudaFree(src);
-  cudaFree(dst);
+  hipFree(src);
+  hipFree(dst);
 
-  cudaStreamDestroy(stream);
+  hipStreamDestroy(stream);
 }
diff --git a/AMDGPU/memcopy.jl b/AMDGPU/memcopy.jl
@@ -1,50 +1,58 @@
-N_BYTES   = 10^8
-C_SAMPLES = 500
+INPUTS["memcopy"] = (
+    n_range   = 2 .^ (16:2:28),
+    c_samples = 2000,
+)
 
 function memcopy_kernel!(dst,src)
-    ix = (blockIdx().x-1i32)*blockDim().x + threadIdx().x
+    ix = (workgroupIdx().x-1)*workgroupDim().x + workitemIdx().x
     if ix <= length(dst)
         @inbounds dst[ix] = src[ix]
     end
     return
 end
 
-function run_julia_benchmarks(nbytes)
-    dst = CuArray{UInt8}(undef,nbytes)
-    src = CuArray{UInt8}(undef,nbytes)
+function run_julia_benchmarks(n)
+    dst      = ROCArray{Float64}(undef,n)
+    src      = ROCArray{Float64}(undef,n)
     nthreads = 256
     nblocks  = cld(length(dst),nthreads)
 
     bm = @benchmark begin
-        CUDA.@sync @cuda blocks=$nblocks threads=$nthreads memcopy_kernel!($dst,$src)
+        @roc gridsize=$nblocks groupsize=$nthreads memcopy_kernel!($dst,$src)
+        AMDGPU.synchronize()
     end
 
-    CUDA.unsafe_free!(dst)
-    CUDA.unsafe_free!(src)
+    AMDGPU.unsafe_free!(dst)
+    AMDGPU.unsafe_free!(src)
 
     return bm
 end
 
-function run_c_benchmarks(lib,nsamples,nbytes)
+function run_c_benchmarks(lib,nsamples,n)
     trial = make_c_trial(nsamples)
 
-    CUDA.reclaim()
-
-    sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
-    @ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,nbytes::Cint)::Cvoid
+    sym = Libdl.dlsym(lib,:run_benchmark)
+    @ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
 
     return trial
 end
 
 group = BenchmarkGroup()
-group["julia"] = run_julia_benchmarks(N_BYTES)
 
-# Add baseline C benchmark
+# Compile C benchmark
 libext  = Sys.iswindows() ? "dll" : "so"
 libname = "memcopy." * libext
-run(`nvcc -O3 -o $libname --shared memcopy.cu`)
+run(`hipcc -O3 -o $libname --shared -fPIC memcopy.cu`)
+
 Libdl.dlopen("./$libname") do lib
-    group["reference"] = run_c_benchmarks(lib,C_SAMPLES,N_BYTES)
+    for N in INPUTS["memcopy"].n_range
+        @info "N = $N"
+        group_n = BenchmarkGroup()
+        group_n["julia"]     = run_julia_benchmarks(N)
+        group_n["reference"] = run_c_benchmarks(lib,INPUTS["memcopy"].c_samples,N)
+        group[N] = group_n
+        display(group_n)
+    end
 end
 
 RESULTS["memcopy"] = group
diff --git a/AMDGPU/runbenchmarks.jl b/AMDGPU/runbenchmarks.jl
@@ -1,12 +1,3 @@
-using CUDA
-import CUDA: i32
-
-using BenchmarkTools
-using Statistics
-using Libdl
-
-RESULTS = BenchmarkGroup()
-
 include("common.jl")
 
 @info "host overhead"
@@ -16,4 +7,26 @@ include("host_overhead.jl")
 include("memcopy.jl")
 
 @info "diffusion"
-include("diffusion_2d.jl")
+include("diffusion_2d.jl")
+
+# Common supertype for the benchmark definitions stored in `_BENCHMARKS`.
+abstract type HPCBenchmark end
+
+# Registry mapping a benchmark name to its definition. The trailing `()` matters:
+# without it the name is bound to the `Dict` type itself rather than to an empty
+# instance, and `keys(_BENCHMARKS)` in `runbenchmarks` would fail.
+_BENCHMARKS = Dict{Symbol, HPCBenchmark}()
+
+"""
+    runbenchmarks(benchmarks=:all)
+
+Resolve which benchmarks were requested and return that selection. Passing `:all`
+(the default) expands to every name registered in `_BENCHMARKS`; any other value is
+returned unchanged. Running the selected benchmarks is not implemented yet.
+"""
+function runbenchmarks(benchmarks=:all)
+    if benchmarks === :all
+        benchmarks = collect(keys(_BENCHMARKS))
+    end
+    return benchmarks
+end
diff --git a/AMDGPU/setup_env.sh b/AMDGPU/setup_env.sh
@@ -0,0 +1,3 @@
+module load LUMI/22.08
+module load partition/G
+module load rocm/5.3.3
diff --git a/CUDA/diffusion_2d.jl b/CUDA/diffusion_2d.jl
@@ -32,7 +32,7 @@ function run_c_benchmarks(lib,nsamples,n)
 
     CUDA.reclaim()
 
-    sym = CUDA.Libdl.dlsym(lib,:run_benchmark)
+    sym = Libdl.dlsym(lib,:run_benchmark)
     @ccall $sym(trial.times::Ptr{Cdouble},nsamples::Cint,n::Cint)::Cvoid
 
     return trial
diff --git a/CUDA/host_overhead.jl b/CUDA/host_overhead.jl
diff --git a/CUDA/memcopy.jl b/CUDA/memcopy.jl

-Original file line number
+Diff line change
 *.dll
 *.exp
 -*.lib
 +*.lib
 +*.so
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+module load LUMI/22.08`
	`2`	`+module load partition/G`
	`3`	`+module load rocm/5.3.3`