diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..a420e6f
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,68 @@
+# GPU test pipeline: one step per Julia version ("1", "1.10") and GPU vendor.
+# A step is skipped when the build message contains "[skip tests]".
+env:
+  # NOTE(review): presumably an encrypted Codecov token used by the coverage plugin -- confirm.
+  SECRET_CODECOV_TOKEN: "EEXB5DS9rR3VXck1NzJougBwxy+3bGKAX9sq1hTwe+rvftmQzdnpy3MlJXLUXQXnBvjezhHZpt07nlG1p9Pi39bnUIddPJHJVVbtqjiGbVuAjVno2tcm8cvi/mYDPoJw7hs8G36IVDb3wklO9wAiO7vwO2br8LQOHMNZBTCUfkb30aT3e/yBnb2QiwNspKCvcd7XYpsmMy78Egdg219sfZ783fG/H7VHv0YzZThj+IAUhm8ftsPURHRmHk28wSdFGzwI2CX8nEx4LgtDhqa+JH84YajIiwWaFymfkw6phpSF3KQNlR53qRWUDD6hClhOizmYyQuZZ8TO5gnNDsrGLg==;U2FsdGVkX1/pfvZY/FJSU7D+DE+6I18s5BSfa63C+31RoDKiHqENegG4whXuxZ5a6YE0XegF8jOretp+E7FiyQ=="
+
+steps:
+  - label: "Julia v1 -- CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia v1 -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22e2e79..5469526 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,6 @@ name: CI
 on:
   push:
     branches:
-      - 'master'
       - 'main'
       - 'release-'
     tags: '*'
@@ -21,7 +20,6 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6' # previous LTS release
           - 'lts' # current LTS release
           - '1' # current stable release
         os:
@@ -45,4 +43,4 @@ jobs:
       - uses: codecov/codecov-action@v5
         with:
-          file: lcov.info
-          
\ No newline at end of file
+          files: lcov.info
+
diff --git a/Project.toml b/Project.toml
index f00886f..55aca8e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,26 +1,42 @@
 name = "Strided"
 uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
-authors = ["Lukas Devos ", "Maarten Van Damme ", "Jutho Haegeman "]
 version = "2.3.2"
+authors = ["Lukas Devos ", "Maarten Van Damme ", "Jutho Haegeman "]
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+StridedAMDGPUExt = "AMDGPU"
+StridedGPUArraysExt = "GPUArrays"
+StridedCUDAExt = "CUDA"
+
 [compat]
+AMDGPU = "2"
 Aqua = "0.8"
+CUDA = "5"
+GPUArrays = "11.4.1"
 LinearAlgebra = "1.6"
 Random = "1.6"
-StridedViews = "0.3.2,0.4"
+StridedViews = "0.4.5"
 Test = "1.6"
 TupleTools = "1.6"
 julia = "1.6"
 
 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays"]
diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl
new file mode 100644
index 0000000..6fa3c40
--- /dev/null
+++ b/ext/StridedAMDGPUExt.jl
@@ -0,0 +1,27 @@
+module StridedAMDGPUExt
+
+using Strided, StridedViews, AMDGPU
+using AMDGPU: Adapt
+using AMDGPU: GPUArrays
+
+# Functions accepted for the fourth `StridedView` type parameter by `copy!` below.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+# Copy `src` into `dst` for views whose third type parameter is a `ROCArray` with
+# `Number` elements, and return `dst`. The copy is expressed as a broadcast of
+# `identity` over `src` and executed by `GPUArrays._copyto!`.
+# Throws `DimensionMismatch` when the two views do not have the same axes.
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: ROCArray{TS}, FS <: ALL_FS}
+    # The broadcast is built on `axes(dst)` rather than derived from `src`, so a
+    # mismatch has to be rejected here.
+    if axes(dst) != axes(src)
+        msg = "copy!: destination axes $(axes(dst)) do not match source axes $(axes(src))"
+        throw(DimensionMismatch(msg))
+    end
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
+    GPUArrays._copyto!(dst, bc)
+    return dst
+end
+
+end
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
new file mode 100644
index 0000000..ec0abfd
--- /dev/null
+++ b/ext/StridedCUDAExt.jl
@@ -0,0 +1,27 @@
+module StridedCUDAExt
+
+using Strided, StridedViews, CUDA
+using CUDA: Adapt, KernelAdaptor
+using CUDA: GPUArrays
+
+# Functions accepted for the fourth `StridedView` type parameter by `copy!` below.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+# Copy `src` into `dst` for views whose third type parameter is a `CuArray` with
+# `Number` elements, and return `dst`. The copy is expressed as a broadcast of
+# `identity` over `src` and executed by `GPUArrays._copyto!`.
+# Throws `DimensionMismatch` when the two views do not have the same axes.
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
+    # The broadcast is built on `axes(dst)` rather than derived from `src`, so a
+    # mismatch has to be rejected here.
+    if axes(dst) != axes(src)
+        msg = "copy!: destination axes $(axes(dst)) do not match source axes $(axes(src))"
+        throw(DimensionMismatch(msg))
+    end
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
+    GPUArrays._copyto!(dst, bc)
+    return dst
+end
+
+end
diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
new file mode 100644
index 0000000..409ad5e
--- /dev/null
+++ b/ext/StridedGPUArraysExt.jl
@@ -0,0 +1,20 @@
+module StridedGPUArraysExt
+
+using Strided, GPUArrays
+using GPUArrays: Adapt, KernelAbstractions
+
+# Declared `const`, as in the CUDA and AMDGPU extensions, so the global has a fixed type.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+# Forward the backend query for a view over a GPU array to `parent(sv)`.
+KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv))
+
+# Broadcast style of the parent array type `TA`, rebuilt for the view's dimensionality `N`.
+# NOTE(review): this method takes a view instance; Base's broadcast machinery queries
+# `BroadcastStyle` with types, so presumably only explicit calls reach it -- confirm.
+function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}}
+    raw_style = Base.Broadcast.BroadcastStyle(TA)
+    return typeof(raw_style)(Val(N)) # sets the dimensionality correctly
+end
+
+end
diff --git a/test/amd.jl b/test/amd.jl
new file mode 100644
index 0000000..fc77c49
--- /dev/null
+++ b/test/amd.jl
@@ -0,0 +1,26 @@
+# copy! between StridedViews over ROCArrays must agree with copy! on the wrapped arrays.
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)
+            if iszero(m1 * m2)
+                A1 = AMDGPU.ROCMatrix{T}(undef, (m1, m2))
+            else
+                A1 = ROCMatrix(randn(T, (m1, m2)))
+            end
+            A2 = similar(A1)
+            A1c = copy(A1)
+            A2c = copy(A2)
+            B1 = f1(StridedView(A1c))
+            B2 = f2(StridedView(A2c))
+            axes(f1(A1)) == axes(f2(A2)) || continue
+            @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == AMDGPU.Adapt.adapt(Vector{T}, copy!(B2, B1))
+        end
+    end
+
+    # Views with different axes must be rejected instead of being copied.
+    @testset "copy! rejects mismatched axes: $T" begin
+        src = StridedView(AMDGPU.ROCMatrix{T}(undef, (4, 3)))
+        dst = StridedView(AMDGPU.ROCMatrix{T}(undef, (3, 4)))
+        @test_throws DimensionMismatch copy!(dst, src)
+    end
+end
diff --git a/test/cuda.jl b/test/cuda.jl
new file mode 100644
index 0000000..695fec9
--- /dev/null
+++ b/test/cuda.jl
@@ -0,0 +1,22 @@
+# copy! between StridedViews over CuArrays must agree with copy! on the wrapped arrays.
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)
+            A1 = CUDA.randn(T, (m1, m2))
+            A2 = similar(A1)
+            A1c = copy(A1)
+            A2c = copy(A2)
+            B1 = f1(StridedView(A1c))
+            B2 = f2(StridedView(A2c))
+            axes(f1(A1)) == axes(f2(A2)) || continue
+            @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == CUDA.Adapt.adapt(Vector{T}, copy!(B2, B1))
+        end
+    end
+
+    # Views with different axes must be rejected instead of being copied.
+    @testset "copy! rejects mismatched axes: $T" begin
+        src = StridedView(CuMatrix{T}(undef, (4, 3)))
+        dst = StridedView(CuMatrix{T}(undef, (3, 4)))
+        @test_throws DimensionMismatch copy!(dst, src)
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fc411cc..3f9ee6f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,25 +3,40 @@ using LinearAlgebra
 using Random
 using Strided
 using Strided: StridedView
+using Aqua
+using AMDGPU, CUDA, GPUArrays
 
 Random.seed!(1234)
 
-println("Base.Threads.nthreads() = $(Base.Threads.nthreads())")
+# The CPU suite below is skipped when the BUILDKITE environment variable is "true".
+is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
 
-println("Running tests single-threaded:")
-Strided.disable_threads()
-include("othertests.jl")
-include("blasmultests.jl")
+if !is_buildkite
+    println("Base.Threads.nthreads() = $(Base.Threads.nthreads())")
 
-println("Running tests multi-threaded:")
-Strided.enable_threads()
-Strided.set_num_threads(Base.Threads.nthreads() + 1)
-include("othertests.jl")
-include("blasmultests.jl")
+    println("Running tests single-threaded:")
+    Strided.disable_threads()
+    include("othertests.jl")
+    include("blasmultests.jl")
 
-Strided.enable_threaded_mul()
-include("blasmultests.jl")
-Strided.disable_threaded_mul()
+    println("Running tests multi-threaded:")
+    Strided.enable_threads()
+    Strided.set_num_threads(Base.Threads.nthreads() + 1)
+    include("othertests.jl")
+    include("blasmultests.jl")
 
-using Aqua
-Aqua.test_all(Strided; piracies = false)
+    Strided.enable_threaded_mul()
+    include("blasmultests.jl")
+    Strided.disable_threaded_mul()
+
+    Aqua.test_all(Strided; piracies = false)
+end
+
+# GPU suites run only when the corresponding backend reports itself functional.
+if CUDA.functional()
+    include("cuda.jl")
+end
+
+if AMDGPU.functional()
+    include("amd.jl")
+end