diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..a420e6f
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,65 @@
+env:
+  SECRET_CODECOV_TOKEN: "EEXB5DS9rR3VXck1NzJougBwxy+3bGKAX9sq1hTwe+rvftmQzdnpy3MlJXLUXQXnBvjezhHZpt07nlG1p9Pi39bnUIddPJHJVVbtqjiGbVuAjVno2tcm8cvi/mYDPoJw7hs8G36IVDb3wklO9wAiO7vwO2br8LQOHMNZBTCUfkb30aT3e/yBnb2QiwNspKCvcd7XYpsmMy78Egdg219sfZ783fG/H7VHv0YzZThj+IAUhm8ftsPURHRmHk28wSdFGzwI2CX8nEx4LgtDhqa+JH84YajIiwWaFymfkw6phpSF3KQNlR53qRWUDD6hClhOizmYyQuZZ8TO5gnNDsrGLg==;U2FsdGVkX1/pfvZY/FJSU7D+DE+6I18s5BSfa63C+31RoDKiHqENegG4whXuxZ5a6YE0XegF8jOretp+E7FiyQ=="
+
+steps:
+  - label: "Julia v1 -- CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~  # null: plugin used without options
+      - JuliaCI/julia-coverage#v1:
+          dirs:  # directories handed to the coverage plugin
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"  # any agent carrying a `cuda` tag, whatever its value
+    if: build.message !~ /\[skip tests\]/  # step is skipped when the build message contains "[skip tests]"
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia v1 -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"  # AMD steps require both the `rocm` and the `rocmgpu` agent tags
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22e2e79..5469526 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,6 @@ name: CI
 on:
   push:
     branches:
-      - 'master'
       - 'main'
       - 'release-'
     tags: '*'
@@ -21,7 +20,6 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6' # previous LTS release
           - 'lts' # current LTS release
           - '1' # current stable release
         os:
@@ -45,4 +43,4 @@ jobs:
       - uses: codecov/codecov-action@v5
         with:
           file: lcov.info
-        
\ No newline at end of file
+        
diff --git a/Project.toml b/Project.toml
index f00886f..55aca8e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,26 +1,42 @@
 name = "Strided"
 uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
-authors = ["Lukas Devos <lukas.devos@ugent.be>", "Maarten Van Damme <maartenvd1994@gmail.com>", "Jutho Haegeman <jutho.haegeman@ugent.be>"]
 version = "2.3.2"
+authors = ["Lukas Devos <lukas.devos@ugent.be>", "Maarten Van Damme <maartenvd1994@gmail.com>", "Jutho Haegeman <jutho.haegeman@ugent.be>"]
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+
+[extensions]
+StridedAMDGPUExt = "AMDGPU"
+StridedCUDAExt = "CUDA"
+StridedGPUArraysExt = "GPUArrays"
+
 [compat]
+AMDGPU = "2"
 Aqua = "0.8"
+CUDA = "5"
+GPUArrays = "11.4.1"
 LinearAlgebra = "1.6"
 Random = "1.6"
-StridedViews = "0.3.2,0.4"
+StridedViews = "0.4.5"
 Test = "1.6"
 TupleTools = "1.6"
 julia = "1.6"
 
 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays"]
diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl
new file mode 100644
index 0000000..6fa3c40
--- /dev/null
+++ b/ext/StridedAMDGPUExt.jl
@@ -0,0 +1,28 @@
+module StridedAMDGPUExt
+
+using Strided, StridedViews, AMDGPU
+using AMDGPU: Adapt
+using AMDGPU: GPUArrays
+
+# Operation types accepted for the `F` parameter of the views handled by `copy!` below.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+"""
+    copy!(dst::StridedView, src::StridedView) -> dst
+
+Copy `src` into `dst` for views whose parents are `ROCArray`s with `Number` elements.
+
+The copy is built as an `identity` broadcast of `src` over `axes(dst)` and executed by
+`GPUArrays._copyto!`. Throws a `DimensionMismatch` when `axes(dst) != axes(src)`.
+"""
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: ROCArray{TS}, FS <: ALL_FS}
+    # The `Broadcasted` below is sized from `axes(dst)` alone, so this is the only place
+    # in the method where the shape of `src` is compared against `dst`.
+    axes(dst) == axes(src) || throw(DimensionMismatch("destination axes $(axes(dst)) do not match source axes $(axes(src))"))
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
+    GPUArrays._copyto!(dst, bc)
+    return dst
+end
+
+end
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
new file mode 100644
index 0000000..ec0abfd
--- /dev/null
+++ b/ext/StridedCUDAExt.jl
@@ -0,0 +1,28 @@
+module StridedCUDAExt
+
+using Strided, StridedViews, CUDA
+using CUDA: Adapt, KernelAdaptor
+using CUDA: GPUArrays
+
+# Operation types accepted for the `F` parameter of the views handled by `copy!` below.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+"""
+    copy!(dst::StridedView, src::StridedView) -> dst
+
+Copy `src` into `dst` for views whose parents are `CuArray`s with `Number` elements.
+
+The copy is built as an `identity` broadcast of `src` over `axes(dst)` and executed by
+`GPUArrays._copyto!`. Throws a `DimensionMismatch` when `axes(dst) != axes(src)`.
+"""
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
+    # The `Broadcasted` below is sized from `axes(dst)` alone, so this is the only place
+    # in the method where the shape of `src` is compared against `dst`.
+    axes(dst) == axes(src) || throw(DimensionMismatch("destination axes $(axes(dst)) do not match source axes $(axes(src))"))
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
+    GPUArrays._copyto!(dst, bc)
+    return dst
+end
+
+end
diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
new file mode 100644
index 0000000..409ad5e
--- /dev/null
+++ b/ext/StridedGPUArraysExt.jl
@@ -0,0 +1,21 @@
+module StridedGPUArraysExt
+
+using Strided, GPUArrays
+using GPUArrays: Adapt, KernelAbstractions
+
+# `const` gives the alias a fixed type; a plain global binding could be reassigned.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+# A view over a GPU array reports the backend of its parent array.
+KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv))
+
+# Returns the broadcast style of the parent array type `TA`, rebuilt for the view's
+# dimensionality `N`.
+# NOTE(review): this method dispatches on a view instance, whereas Base looks styles up by
+# type (`BroadcastStyle(typeof(x))`); presumably it is only reached by explicit calls — confirm.
+function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}}
+    raw_style = Base.Broadcast.BroadcastStyle(TA)
+    return typeof(raw_style)(Val(N)) # sets the dimensionality correctly
+end
+
+end
diff --git a/test/amd.jl b/test/amd.jl
new file mode 100644
index 0000000..fc77c49
--- /dev/null
+++ b/test/amd.jl
@@ -0,0 +1,19 @@
+# Compares `copy!` between `StridedView`s of ROCm matrices with `copy!` on the underlying
+# ROCm matrices, for every element type, pair of view operations and matrix size.
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)
+            # Empty matrices are allocated directly; non-empty ones are converted from a host `randn` array.
+            A1 = iszero(m1 * m2) ? AMDGPU.ROCMatrix{T}(undef, (m1, m2)) : ROCMatrix(randn(T, (m1, m2)))
+            A2 = similar(A1)
+            # The views wrap copies, so the reference `copy!` on `A1`/`A2` does not touch the data they see.
+            B1 = f1(StridedView(copy(A1)))
+            B2 = f2(StridedView(copy(A2)))
+            # Only combinations whose source and destination axes agree are compared.
+            if axes(f1(A1)) == axes(f2(A2))
+                expected = collect(ROCMatrix(copy!(f2(A2), f1(A1))))
+                @test expected == AMDGPU.Adapt.adapt(Vector{T}, copy!(B2, B1))
+            end
+        end
+    end
+end
diff --git a/test/cuda.jl b/test/cuda.jl
new file mode 100644
index 0000000..695fec9
--- /dev/null
+++ b/test/cuda.jl
@@ -0,0 +1,18 @@
+# Compares `copy!` between `StridedView`s of CUDA matrices with `copy!` on the underlying
+# CUDA matrices, for every element type, pair of view operations and matrix size.
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)
+            A1 = CUDA.randn(T, (m1, m2))
+            A2 = similar(A1)
+            # The views wrap copies, so the reference `copy!` on `A1`/`A2` does not touch the data they see.
+            B1 = f1(StridedView(copy(A1)))
+            B2 = f2(StridedView(copy(A2)))
+            # Only combinations whose source and destination axes agree are compared.
+            if axes(f1(A1)) == axes(f2(A2))
+                expected = collect(CuMatrix(copy!(f2(A2), f1(A1))))
+                @test expected == CUDA.Adapt.adapt(Vector{T}, copy!(B2, B1))
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fc411cc..3f9ee6f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,25 +3,38 @@ using LinearAlgebra
 using Random
 using Strided
 using Strided: StridedView
+using Aqua
+using AMDGPU, CUDA, GPUArrays
 
 Random.seed!(1234)
 
-println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
+is_buildkite = get(ENV, "BUILDKITE", "false") == "true"  # true only when ENV["BUILDKITE"] is exactly "true"
 
-println("Running tests single-threaded:")
-Strided.disable_threads()
-include("othertests.jl")
-include("blasmultests.jl")
+if !is_buildkite  # the CPU test files and the Aqua checks in this block are skipped on Buildkite
+    println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
 
-println("Running tests multi-threaded:")
-Strided.enable_threads()
-Strided.set_num_threads(Base.Threads.nthreads() + 1)
-include("othertests.jl")
-include("blasmultests.jl")
+    println("Running tests single-threaded:")
+    Strided.disable_threads()
+    include("othertests.jl")
+    include("blasmultests.jl")
 
-Strided.enable_threaded_mul()
-include("blasmultests.jl")
-Strided.disable_threaded_mul()
+    println("Running tests multi-threaded:")
+    Strided.enable_threads()
+    Strided.set_num_threads(Base.Threads.nthreads() + 1)
+    include("othertests.jl")
+    include("blasmultests.jl")
 
-using Aqua
-Aqua.test_all(Strided; piracies = false)
+    Strided.enable_threaded_mul()  # third pass over the mul tests, with threaded mul switched on
+    include("blasmultests.jl")
+    Strided.disable_threaded_mul()
+
+    Aqua.test_all(Strided; piracies = false)
+end
+
+if CUDA.functional()  # CUDA tests are included only when `CUDA.functional()` returns true
+    include("cuda.jl")
+end
+
+if AMDGPU.functional()  # AMDGPU tests are included only when `AMDGPU.functional()` returns true
+    include("amd.jl")
+end