diff --git a/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl b/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl index 2dba8286bb..d482841e27 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl @@ -5,3 +5,16 @@ function Base.permutedims!( copyto!(expose(parent(Edest)), expose(Aperm)) return unexpose(Edest) end + +## Found an issue in CUDA where, if Edest is a ReshapedArray wrapping an Adjoint, +## `.=` can fail. So instead force the permuted Esrc into the shape of parent(Edest) +function Base.permutedims!( + Edest::Exposed{<:CuArray,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}, + Esrc::Exposed{<:CuArray}, + perm, + f, +) + Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest))) + parent(Edest) .= f.(parent(Edest), Aperm) + return unexpose(Edest) +end diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl index 7c388a0fc8..2abf8e8cbf 100644 --- a/NDTensors/ext/NDTensorsMetalExt/mul.jl +++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl @@ -19,3 +19,22 @@ function LinearAlgebra.mul!( mul!(CM', BM', AM', α, β) return unexpose(CM) end + +## Fix an issue where Metal.jl does not recognize Transpose{ReshapedArray{Adjoint{MtlArray}}} +## as an MtlArray and therefore calls the generic matmul +function LinearAlgebra.mul!( + CM::Exposed{<:MtlArray}, + AM::Exposed{<:MtlArray}, + BM::Exposed{ + <:MtlArray, + <:LinearAlgebra.Transpose{ + <:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint} + }, + }, + α, + β, +) + B = copy(expose(parent(BM))) + mul!(CM, AM, expose(transpose(B)), α, β) + return unexpose(CM) +end diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl index ad29610527..40cc3f588c 100644 --- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl +++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl @@ -1,3 +1,11 @@ +## There's an issue in Metal where `ReshapedArray`-wrapped arrays cannot be permuted using +## permutedims (it fails because Metal falls back to scalar indexing). +## These functions address the problem for the different variants of permutedims. +function Base.permutedims(E::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm) + A = copy(E) + return permutedims(A, perm) +end + function Base.permutedims!( Edest::Exposed{<:MtlArray,<:Base.ReshapedArray}, Esrc::Exposed{<:MtlArray}, perm ) @@ -5,3 +13,25 @@ function Base.permutedims!( copyto!(expose(parent(Edest)), expose(Aperm)) return unexpose(Edest) end + +function Base.permutedims!( + Edest::Exposed{<:MtlArray}, Esrc::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm +) + Aperm = permutedims(Esrc, perm) + copyto!(Edest, expose(Aperm)) + return unexpose(Edest) +end + +## To get around the Metal issue here we copy and permute Esrc, +## then reshape the result to the size of Edest's parent +## and broadcast into the parent.
+function Base.permutedims!( + Edest::Exposed{<:MtlArray,<:Base.ReshapedArray}, + Esrc::Exposed{<:MtlArray,<:Base.ReshapedArray}, + perm, + f, +) + Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest))) + parent(Edest) .= f.(parent(Edest), Aperm) + return unexpose(Edest) +end diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index 6a9efe749f..c78efa4416 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -1,13 +1,14 @@ -using CUDA using NDTensors - -using ITensors -using Test - -using Zygote +using CUDA: CUDA, CuVector, cu, reshape +using ITensors: + Index, ITensor, randomMPO, randomMPS, inner, orthogonalize, qr, siteinds, svd +using Test: @test +using Zygote: gradient function main() # using ITensorGPU + cpu = NDTensors.cpu + gpu = NDTensors.cu # Here is an example of how to utilize NDTensors based tensors with CUDA datatypes i = Index(2) j = Index(5) @@ -18,10 +19,9 @@ function main() dim2 = (j, k) # Create 2 ITensors with CUDA backends (These will be made simpiler by randomITensor(CuVector) soon) - A = ITensor(NDTensors.generic_randn(CuVector, dim(dim1)), dim1) - B = ITensor(NDTensors.generic_randn(CuVector, dim(dim2)), dim2) + A = ITensor(randomTensor(CuVector, dim1)) + B = ITensor(randomTensor(CuVector, dim2)) # Contract the two tensors - cpu = NDTensors.cpu C = A * B A = cpu(A) B = cpu(B) @@ -36,8 +36,8 @@ function main() fill!(B, randn()) # Convert the ITensors to GPU - cA = NDTensors.cu(A) - cB = NDTensors.cu(B) + cA = gpu(A) + cB = gpu(B) #Check that backend of contraction is GPU @test A * A ≈ cpu(cA * cA) @@ -47,11 +47,8 @@ function main() dim3 = (l, k) dim4 = (i,) - cC = ITensor( - NDTensors.generic_randn(CuVector{Float64,CUDA.Mem.DeviceBuffer}, dim(dim3)), dim3 - ) - cC = NDTensors.cu(ITensor(NDTensors.generic_randn(Vector{Float64}, dim(dim3)), dim3)) - cD = ITensor(Tensor(CuVector, dim4)) + cC = ITensor(randomTensor(CuVector{Float64,CUDA.Mem.DeviceBuffer}, dim3)) + cD = ITensor(Tensor(CuVector{Float32}, dim4)) fill!(cD, randn()) # Create a function of 4 tensors on GPU @@ -61,20 +58,18 @@ function main() #Currently this code fails with CUDA.allowscalar(false) # Because of outer calling the _gemm! 
function which calls a # generic implementation - @allowscalar grad = gradient(f, cA, cB, cC, cD) - @allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1]) - @allowscalar @test (cB * cC * cD) ≈ grad[1] + grad = gradient(f, cA, cB, cC, cD) + @test cpu(cB * cC * cD) ≈ cpu(grad[1]) + @test (cB * cC * cD) ≈ grad[1] # Create a tuple of indices - decomp = ( - dim(NDTensors.ind(grad[1], 1)), - dim(NDTensors.ind(grad[1], 2)) * dim(NDTensors.ind(grad[1], 3)), - ) + dims = size(grad[1]) + decomp = (dims[1], dims[2] * dims[3]) # Reshape the CuVector of data into a matrix - cuTensor_data = CUDA.reshape(NDTensors.data(storage(grad[1])), decomp) + cuTensor_data = reshape(array(grad[1]), decomp) # Use cuBLAS to compute SVD of data U, S, V = svd(cuTensor_data) - decomp = (dim(NDTensors.ind(grad[2], 1)), dim(NDTensors.ind(grad[2], 2))) - cuTensor_data = CUDA.reshape(NDTensors.data(storage(grad[2])), decomp) + decomp = size(array(grad[2])) + cuTensor_data = reshape(array(grad[2]), decomp) U, S, V = svd(cuTensor_data) # These things can take up lots of memory, look at memory usage here @@ -87,8 +82,7 @@ function main() CUDA.memory_status() # Its possible to compute QR of GPU tensor - cq = ITensors.qr(cA, (i,), (j, l)) - q = ITensors.qr(A, (i,), (j, l)) + cq = qr(cA, (i,), (j, l)) A ≈ cpu(cq[1]) * cpu(cq[2]) ## SVD does not yet work with CUDA backend, see above on @@ -96,24 +90,25 @@ function main() ## CuVectors... #ITensors.svd(A, (i,), (j, l)) - s = ITensors.siteinds("S=1/2", 8) + s = siteinds("S=1/2", 8) m = randomMPS(s; linkdims=4) - cm = NDTensors.cu(m) + cm = gpu(m) @test inner(cm', cm) ≈ inner(m', m) H = randomMPO(s) - cH = NDTensors.cu(H) + cH = gpu(H) @test inner(cm', cH, cm) ≈ inner(m', H, m) m = orthogonalize(m, 1) - cm = NDTensors.cu(orthogonalize(cm, 1)) + cm = gpu(orthogonalize(cm, 1)) @test inner(m', m) ≈ inner(cm', cm) H = orthogonalize(H, 1) - cH = NDTensors.cu(cH) + cH = gpu(cH) @test inner(cm', cH, cm) ≈ inner(m', H, m) end +## Run the main function with Float64 main() diff --git a/NDTensors/ext/examples/NDTensorMetal.jl b/NDTensors/ext/examples/NDTensorMetal.jl index 6f670b3083..95f0445f6c 100644 --- a/NDTensors/ext/examples/NDTensorMetal.jl +++ b/NDTensors/ext/examples/NDTensorMetal.jl @@ -1,11 +1,13 @@ -using Metal +using Metal: MtlVector, mtl using NDTensors -using ITensors -using Test -using Zygote +using ITensors: ITensor, Index, randomITensor +using Test: @test +using Zygote: gradient function main() + cpu = NDTensors.cpu + gpu = NDTensors.mtl # Here is an example of how to utilize NDTensors based tensors with CUDA datatypes i = Index(20) j = Index(5) @@ -15,27 +17,26 @@ function main() dim1 = (i, j, l) dim2 = (j, k) - cA = ITensor(NDTensors.generic_randn(MtlVector{Float32}, dim(dim1)), dim1) - cB = ITensor(NDTensors.generic_randn(MtlVector{Float32}, dim(dim2)), dim2) + ## MtlArrays only support Float32 arithmetic + cA = ITensor(randomTensor(MtlVector{Float32}, dim1)) + cB = ITensor(randomTensor(MtlVector{Float32}, dim2)) cC = cA * cB - cpu = NDTensors.cpu A = cpu(cA) B = cpu(cB) @test A * B ≈ cpu(cC) - #C = A * B - dim3 = (l, k) dim4 = (i,) - cC = mtl(randomITensor(Float32, dim3)) - cD = mtl(randomITensor(Float32, dim4)) + cC = gpu(randomITensor(Float32, dim3)) + cD = gpu(randomITensor(Float32, dim4)) f(A, B, C, D) = (A * B * C * D)[] - return grad = gradient(f, cA, cB, cC, cD) + grad = gradient(f, cA, cB, cC, cD) + @test grad[2] ≈ cA * cC * cD end main() diff --git a/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl
b/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl index c0d4db7654..8e6a4a2cee 100644 --- a/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl +++ b/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl @@ -49,7 +49,7 @@ function permutedims_combine( # Now that the indices are permuted, compute # which indices are now combined - combdims_perm = sort(_permute_combdims(combdims, perm)) + combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm)) comb_ind_loc = minimum(combdims_perm) # Determine the new index before combining @@ -117,7 +117,7 @@ function permutedims_combine_output( # Now that the indices are permuted, compute # which indices are now combined - combdims_perm = sort(_permute_combdims(combdims, perm)) + combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm)) # Permute the nonzero blocks (dimension-wise) blocks = nzblocks(a_src) diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index 1ef7100ad5..db2b6d85e9 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -49,7 +49,7 @@ Construct a block sparse tensor with uninitialized memory from indices and locations of non-zero blocks. """ function BlockSparseTensor(::UndefInitializer, blockoffsets, inds) - return BlockSparseTensor(Float64, undef, blockoffsets, inds) + return BlockSparseTensor(default_eltype(), undef, blockoffsets, inds) end function BlockSparseTensor( @@ -65,7 +65,7 @@ function BlockSparseTensor(eltype::Type{<:Number}, blockoffsets::BlockOffsets, i end function BlockSparseTensor(blockoffsets::BlockOffsets, inds) - return BlockSparseTensor(Float64, blockoffsets, inds) + return BlockSparseTensor(default_eltype(), blockoffsets, inds) end """ @@ -73,7 +73,7 @@ end Construct a block sparse tensor with no blocks. """ -BlockSparseTensor(inds) = BlockSparseTensor(Float64, inds) +BlockSparseTensor(inds) = BlockSparseTensor(default_eltype(), inds) function BlockSparseTensor(datatype::Type{<:AbstractArray}, inds) return BlockSparseTensor(datatype, BlockOffsets{length(inds)}(), inds) @@ -99,7 +99,7 @@ Construct a block sparse tensor with the specified blocks. Defaults to setting structurally non-zero blocks to zero. 
""" function BlockSparseTensor(blocks::Vector{BlockT}, inds) where {BlockT<:Union{Block,NTuple}} - return BlockSparseTensor(Float64, blocks, inds) + return BlockSparseTensor(default_eltype(), blocks, inds) end function BlockSparseTensor( @@ -160,7 +160,7 @@ function randomBlockSparseTensor(blocks::Vector, inds) end function randomBlockSparseTensor(rng::AbstractRNG, blocks::Vector, inds) - return randomBlockSparseTensor(rng, Float64, blocks, inds) + return randomBlockSparseTensor(rng, default_eltype(), blocks, inds) end """ @@ -176,6 +176,12 @@ function BlockSparseTensor( return BlockSparseTensor(blocks, inds) end +function BlockSparseTensor{ElT}( + blocks::Vector{BlockT}, inds::Vararg{BlockDim,N} +) where {ElT<:Number,BlockT<:Union{Block{N},NTuple{N,<:Integer}}} where {N} + return BlockSparseTensor(ElT, blocks, inds) +end + function zeros( tensor::BlockSparseTensor{ElT,N}, blockoffsets::BlockOffsets{N}, inds ) where {ElT,N} @@ -440,7 +446,7 @@ function permutedims_combine_output( # Now that the indices are permuted, compute # which indices are now combined - combdims_perm = sort(_permute_combdims(combdims, perm)) + combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm)) # Permute the nonzero blocks (dimension-wise) blocks = nzblocks(T) @@ -475,7 +481,7 @@ function permutedims_combine( # Now that the indices are permuted, compute # which indices are now combined - combdims_perm = sort(_permute_combdims(combdims, perm)) + combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm)) comb_ind_loc = minimum(combdims_perm) # Determine the new index before combining diff --git a/NDTensors/src/dense/tensoralgebra/outer.jl b/NDTensors/src/dense/tensoralgebra/outer.jl index 591b3bc160..1e78fda1fb 100644 --- a/NDTensors/src/dense/tensoralgebra/outer.jl +++ b/NDTensors/src/dense/tensoralgebra/outer.jl @@ -21,12 +21,9 @@ function outer!( v1 = data(T1) v2 = data(T2) RM = reshape(R, length(v1), length(v2)) - ## Potential fix is call reshape on array - #RM = reshape(array(R), length(v1), length(v2)) - #RM .= v1 .* transpose(v2) - #mul!(RM, v1, transpose(v2)) - _gemm!('N', 'T', one(ElR), v1, v2, zero(ElR), RM) - #mul!!(RM, v1, transpose(v2), one(ElR), zero(ElR)) + ## There is no _gemm! defined for CUDA or Metal so it calls + ## generic matmul. Replace with mul!! to call correct mul! 
(ger) + mul!!(array(RM), v1, transpose(v2), one(ElR), zero(ElR)) return R end diff --git a/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl b/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl index 862a504388..9f8bc253e3 100644 --- a/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl +++ b/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl @@ -1,3 +1,4 @@ +@eval module $(gensym()) using Test: @test, @testset, @test_broken using BlockArrays: BlockArrays, BlockRange, blocksize using Compat: allequal @@ -101,3 +102,4 @@ include("TestBlockSparseArraysUtils.jl") @test Hermitian(Matrix(a)) * Matrix(u) ≈ Matrix(u) * Diagonal(Vector(d)) end end +end diff --git a/NDTensors/src/lib/SetParameters/test/runtests.jl b/NDTensors/src/lib/SetParameters/test/runtests.jl index 646a983115..6c5c5d25e2 100644 --- a/NDTensors/src/lib/SetParameters/test/runtests.jl +++ b/NDTensors/src/lib/SetParameters/test/runtests.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) +using Test: @inferred, @test, @testset using NDTensors.SetParameters @testset "Test NDTensors.SetParameters" begin @@ -152,3 +153,4 @@ using NDTensors.SetParameters Array{Float64,1} end end +end diff --git a/NDTensors/src/lib/SmallVectors/test/runtests.jl b/NDTensors/src/lib/SmallVectors/test/runtests.jl index 4b1daac154..d86bfd1060 100644 --- a/NDTensors/src/lib/SmallVectors/test/runtests.jl +++ b/NDTensors/src/lib/SmallVectors/test/runtests.jl @@ -1,5 +1,5 @@ using NDTensors.SmallVectors -using Test +using Test: @inferred, @test, @testset, @test_broken using NDTensors.SmallVectors: setindex, diff --git a/NDTensors/src/lib/SortedSets/test/runtests.jl b/NDTensors/src/lib/SortedSets/test/runtests.jl index 14061d17dd..882fa15e18 100644 --- a/NDTensors/src/lib/SortedSets/test/runtests.jl +++ b/NDTensors/src/lib/SortedSets/test/runtests.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) +using Test: @test, @testset using NDTensors.SortedSets using NDTensors.SmallVectors @@ -38,3 +39,4 @@ using NDTensors.SmallVectors @test ("a", 3) ∉ parent(s) end end +end diff --git a/NDTensors/src/lib/TagSets/test/runtests.jl b/NDTensors/src/lib/TagSets/test/runtests.jl index cf7336bbc1..ee5c72a199 100644 --- a/NDTensors/src/lib/TagSets/test/runtests.jl +++ b/NDTensors/src/lib/TagSets/test/runtests.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) +using Test: @test, @testset using NDTensors.TagSets using NDTensors.SortedSets using NDTensors.SmallVectors @@ -31,3 +32,4 @@ using NDTensors.Dictionaries end end end +end diff --git a/NDTensors/src/lib/TensorAlgebra/test/runtests.jl b/NDTensors/src/lib/TensorAlgebra/test/runtests.jl index d5182591a8..21ae8f3fb5 100644 --- a/NDTensors/src/lib/TensorAlgebra/test/runtests.jl +++ b/NDTensors/src/lib/TensorAlgebra/test/runtests.jl @@ -1,3 +1,4 @@ +@eval module $(gensym()) using Combinatorics: permutations using LinearAlgebra: qr using NDTensors.TensorAlgebra: TensorAlgebra @@ -49,3 +50,4 @@ using Test: @test, @test_broken, @testset @test a ≈ a′ end end +end diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index ec68afae03..0d1082601e 100644 --- a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -1,10 +1,23 @@ -using Test +@eval module $(gensym()) +using Test: @testset, @test, @test_broken using NDTensors.Unwrap -using NDTensors -using LinearAlgebra +using NDTensors: NDTensors, mul!! 
+using LinearAlgebra: + LinearAlgebra, + Adjoint, + Diagonal, + Hermitian, + Symmetric, + Transpose, + eigen, + mul!, + norm, + qr, + svd using GPUArraysCore: @allowscalar +include(joinpath(pkgdir(NDTensors), "test", "NDTensorsTestUtils", "NDTensorsTestUtils.jl")) +using .NDTensorsTestUtils: devices_list -include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) @testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS), elt in (Float32, ComplexF32) @@ -18,8 +31,8 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) v_type = typeof(v) e_type = eltype(v) @test typeof(E) == Exposed{v_type,v_type} - @test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}} - @test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}} + @test typeof(Et) == Exposed{v_type,Transpose{e_type,v_type}} + @test typeof(Ea) == Exposed{v_type,Adjoint{e_type,v_type}} @test parent(E) == v @test parent(Et) == v @@ -37,8 +50,8 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) m_type = typeof(m) @test typeof(E) == Exposed{m_type,m_type} - @test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}} - @test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}} + @test typeof(Et) == Exposed{m_type,Transpose{e_type,m_type}} + @test typeof(Ea) == Exposed{m_type,Adjoint{e_type,m_type}} o = dev(randn(elt, 1)) expose(o)[] = 2 @@ -114,8 +127,42 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) y = dev(rand(elt, 4, 4)) x = Base.ReshapedArray(dev(rand(elt, 16)), (4, 4), ()) copyto!(expose(y), expose(x)) - @test NDTensors.cpu(y) == NDTensors.cpu(x) - @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x) + @test cpu(y) == cpu(x) + @test cpu(copy(expose(x))) == cpu(x) + + ## Tests for Metal because permutedims with ReshapedArray does not work properly + ## transpose(ReshapedArray(MtlArray)) fails with scalar indexing so calling copy to + ## evaluate tests in the following tests + y = dev(rand(elt, 4, 4)) + @test permutedims(expose(y), (2, 1)) == transpose(y) + y = Base.ReshapedArray(y, (2, 8), ()) + @test permutedims(expose(y), (2, 1)) == transpose(copy(expose(y))) + yt = dev(rand(elt, (8, 2))) + permutedims!(expose(y), expose(yt), (2, 1)) + @test copy(expose(y)) == transpose(yt) + yt = dev(rand(elt, 8, 2)) + permutedims!(expose(yt), expose(y), (2, 1)) + @test copy(expose(y)) == transpose(yt) + + y = reshape(dev(randn(elt, 8))', 2, 4) + x = Base.ReshapedArray(dev(randn(elt, 8, 8)'[1:8]), (2, 4), ()) + z = dev(fill!(Matrix{elt}(undef, (2, 4)), 0.0)) + for i in 1:2 + for j in 1:4 + @allowscalar z[i, j] = y[i, j] * x[i, j] + end + end + permutedims!(expose(y), expose(x), (1, 2), *) + @allowscalar @test reshape(z, size(y)) ≈ y + for i in 1:2 + for j in 1:4 + @allowscalar z[i, j] = x[i, j] * y[i, j] + end + end + permutedims!(expose(x), expose(y), (1, 2), *) + ## I copy x here because it is a ReshapedArray{SubArray} which causes `≈` + ## to throw an error + @test z ≈ copy(expose(x)) y = dev(rand(elt, 4, 4)) x = @view dev(rand(elt, 8, 8))[1:4, 1:4] @@ -142,7 +189,7 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) x = dev(randn(elt, 4, 4)) permutedims!(expose(y), expose(x), (2, 1)) - @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) + @test cpu(y) == transpose(cpu(x)) ########################################## ### Testing an issue with CUDA&Metal transpose/adjoint mul @@ -152,7 +199,7 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) Cp 
= copy(C) ## This fails with scalar indexing - if dev != NDTensors.cpu + if dev != cpu @test_broken mul!(transpose(C), transpose(A), B, true, false) end mul!(C, transpose(B), A, true, false) @@ -160,19 +207,19 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) @test C ≈ Cp Cp = zero(C) ## Try calling mul!! with transposes to verify that code works - Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) + Cpt = mul!!(transpose(Cp), transpose(A), B, true, false) @test transpose(Cpt) ≈ C Cp = zero(C) ## This fails with scalar indexing - if dev != NDTensors.cpu + if dev != cpu @test_broken mul!(C', A', B, true, false) end mul!(C, B', A, true, false) mul!(expose(Cp'), expose(A'), expose(B), true, false) @test C ≈ Cp Cp = zero(C) - Cpt = NDTensors.mul!!(Cp', A', B, true, false) + Cpt = mul!!(Cp', A', B, true, false) @test Cpt' ≈ C ################################## @@ -181,13 +228,12 @@ include(joinpath(pkgdir(NDTensors), "test", "device_list.jl")) A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) B = dev(randn(elt, 2, 2)) C = dev(zeros(elt, 2, 12)) - NDTensors.mul!(expose(C), expose(B), expose(A), true, false) - Cp = NDTensors.cpu(similar(C)) - NDTensors.mul!( - expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false - ) - @test NDTensors.cpu(C) ≈ Cp - NDTensors.zero(C) - NDTensors.mul!!(C, B, A, true, false) - @test NDTensors.cpu(C) ≈ Cp + mul!(expose(C), expose(B), expose(A), true, false) + Cp = cpu(similar(C)) + mul!(expose(Cp), expose(cpu(B)), expose(cpu(A)), true, false) + @test cpu(C) ≈ Cp + zero(C) + mul!!(C, B, A, true, false) + @test cpu(C) ≈ Cp +end end diff --git a/NDTensors/src/tupletools.jl b/NDTensors/src/tupletools.jl index 8e9874bfb5..395f0619f5 100644 --- a/NDTensors/src/tupletools.jl +++ b/NDTensors/src/tupletools.jl @@ -151,7 +151,7 @@ end deleteat(t::Tuple, I::Tuple{Int}) = deleteat(t, I[1]) function deleteat(t::Tuple, I::Tuple{Int,Int,Vararg{Int}}) - return deleteat_sorted(t, sort(I; rev=true)) + return deleteat_sorted(t, TupleTools.sort(I; rev=true)) end deleteat_sorted(t::Tuple, pos::Int64) = deleteat(t, pos[1]) diff --git a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl index cf77be08cd..3e34f27009 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl @@ -10,14 +10,6 @@ reference_energies = Dict([ (4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885) ]) -default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt)))) - -is_supported_eltype(dev, elt::Type) = true -is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{Float64}) = false -function is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{<:Complex}) - return is_supported_eltype(dev, real(elt)) -end - is_broken(dev, elt::Type, conserve_qns::Val) = false is_broken(dev::typeof(NDTensors.cu), elt::Type, conserve_qns::Val{true}) = true diff --git a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl index 1ec22fdf86..d997c088db 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl @@ -1,6 +1,8 @@ using ITensors: MPO, OpSum, dmrg, randomMPS, siteinds using Random: Random using Test: @test +include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: default_rtol # TODO: Include file with `reference_energies`. 
function test_dmrg( @@ -28,5 +30,6 @@ function test_dmrg( maxdim = 32 energy, psi = dmrg(H, psi0; nsweeps, cutoff, maxdim, noise, outputlevel) + @test energy ≈ reference_energies[N] rtol = rtol_scale * default_rtol(elt) end diff --git a/NDTensors/test/ITensors/runtests.jl b/NDTensors/test/ITensors/runtests.jl index 93d2c2d46c..373906bb70 100644 --- a/NDTensors/test/ITensors/runtests.jl +++ b/NDTensors/test/ITensors/runtests.jl @@ -3,13 +3,14 @@ using SafeTestsets: @safetestset @safetestset "Downstream tests for ITensor DMRG" begin using Test: @testset include("TestITensorDMRG/TestITensorDMRG.jl") - include("../device_list.jl") + include("../NDTensorsTestUtils/NDTensorsTestUtils.jl") + using .NDTensorsTestUtils: devices_list, is_supported_eltype @testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in devices_list(ARGS), conserve_qns in [false, true], elt in (Float32, ComplexF32, Float64, ComplexF64), N in [4, 10] - if !TestITensorDMRG.is_supported_eltype(dev, elt) + if !is_supported_eltype(dev, elt) continue end if TestITensorDMRG.is_broken(dev, elt, Val(conserve_qns)) @@ -21,6 +22,8 @@ using SafeTestsets: @safetestset end end using ITensors.ITensorsNamedDimsArraysExt: to_nameddimsarray + ## Without this line this test was throwing a `NDTensors not defined` error + using NDTensors: NDTensors @testset "Test DMRG with NamedDimsArrays" for dev in (NDTensors.cpu,), conserve_qns in [false], elt in (Float32, Float64), diff --git a/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl new file mode 100644 index 0000000000..945c6653eb --- /dev/null +++ b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl @@ -0,0 +1,10 @@ +module NDTensorsTestUtils + +using NDTensors + +include("device_list.jl") +include("is_supported_eltype.jl") + +default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt)))) + +end diff --git a/NDTensors/test/device_list.jl b/NDTensors/test/NDTensorsTestUtils/device_list.jl similarity index 96% rename from NDTensors/test/device_list.jl rename to NDTensors/test/NDTensorsTestUtils/device_list.jl index 4efb4b1f7f..c2a8a36676 100644 --- a/NDTensors/test/device_list.jl +++ b/NDTensors/test/NDTensorsTestUtils/device_list.jl @@ -24,7 +24,6 @@ function devices_list(test_args) if "metal" in test_args || "all" in test_args push!(devs, NDTensors.mtl) - Metal.allowscalar() end return devs end diff --git a/NDTensors/test/NDTensorsTestUtils/is_supported_eltype.jl b/NDTensors/test/NDTensorsTestUtils/is_supported_eltype.jl new file mode 100644 index 0000000000..0dc16531e8 --- /dev/null +++ b/NDTensors/test/NDTensorsTestUtils/is_supported_eltype.jl @@ -0,0 +1,5 @@ +is_supported_eltype(dev, elt::Type) = true +is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{Float64}) = false +function is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{<:Complex}) + return is_supported_eltype(dev, real(elt)) +end diff --git a/NDTensors/test/readwrite.jl b/NDTensors/test/broken/readwrite.jl similarity index 92% rename from NDTensors/test/readwrite.jl rename to NDTensors/test/broken/readwrite.jl index 4122a65b52..50576d3aa5 100644 --- a/NDTensors/test/readwrite.jl +++ b/NDTensors/test/broken/readwrite.jl @@ -1,3 +1,5 @@ +## TODO: this file was not included in the previous test suite +## and appears to be out of date with the current code.
using NDTensors, Test using HDF5 diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl index 1e8beed07b..872ec7f5b1 100644 --- a/NDTensors/test/runtests.jl +++ b/NDTensors/test/runtests.jl @@ -1,43 +1,25 @@ -using Test -using SafeTestsets - -println("Passing arguments ARGS=$(ARGS) to test.") - -if isempty(ARGS) || "base" in ARGS - println( - """\nArguments ARGS = $(ARGS) are empty, or contain `"base"`. Running cpu NDTensors tests.""", - ) -end -if "cuda" in ARGS || "all" in ARGS - println("""\nArguments ARGS = $(ARGS) contain `"cuda"`. Running NDTensorCUDA tests.""") - using CUDA -end -if "metal" in ARGS || "all" in ARGS - println("""\nArguments ARGS = $(ARGS) contain`"metal"`. Running NDTensorMetal tests.""") - using Metal -end +using SafeTestsets: @safetestset @safetestset "NDTensors" begin - @testset "$filename" for filename in [ - "arraytensor/runtests.jl", - "ITensors/runtests.jl", - "lib/runtests.jl", - "blocksparse.jl", - "combiner.jl", - "dense.jl", - "diag.jl", - "diagblocksparse.jl", - "emptynumber.jl", - "emptystorage.jl", - "linearalgebra.jl", - ] - println("Running $filename") - include(filename) + using Test: @testset + @testset "$(@__DIR__)" begin + filenames = filter(readdir(@__DIR__)) do f + startswith("test_")(f) && endswith(".jl")(f) + end + for dir in ["lib", "arraytensor", "ITensors"] + push!(filenames, joinpath(dir, "runtests.jl")) + end + @testset "Test $(@__DIR__)/$filename" for filename in filenames + println("Running $(@__DIR__)/$filename") + include(filename) + end end if "cuda" in ARGS || "all" in ARGS + using NDTensors: NDTensors include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorCUDA.jl")) end if "metal" in ARGS || "all" in ARGS + using NDTensors: NDTensors include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorMetal.jl")) end end diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/test_blocksparse.jl similarity index 72% rename from NDTensors/test/blocksparse.jl rename to NDTensors/test/test_blocksparse.jl index e85580ab80..eefaf51734 100644 --- a/NDTensors/test/blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -1,22 +1,27 @@ +@eval module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using LinearAlgebra: Hermitian, exp, svd +using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: default_rtol, devices_list, is_supported_eltype @testset "BlockSparseTensor basic functionality" begin C = nothing - include("device_list.jl") - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs - elt = (dev == NDTensors.mtl ? 
Float32 : Float64) + @testset "test device: $dev, eltype: $elt" for dev in devices_list(copy(ARGS)), + elt in (Float32, Float64) + + if !is_supported_eltype(dev, elt) + continue + end # Indices indsA = ([2, 3], [4, 5]) # Locations of non-zero blocks locs = [(1, 2), (2, 1)] - A = dev(BlockSparseTensor(locs, indsA...)) + A = dev(BlockSparseTensor{elt}(locs, indsA...)) randn!(A) @test blockdims(A, (1, 2)) == (2, 5) @@ -72,7 +77,7 @@ using GPUArraysCore: @allowscalar @test A12[I] == A[I + CartesianIndex(0, 4)] end - B = dev(BlockSparseTensor(undef, locs, indsA)) + B = dev(BlockSparseTensor(elt, undef, locs, indsA)) randn!(B) C = A + B @@ -98,7 +103,7 @@ using GPUArraysCore: @allowscalar @test typeof(conj(A)) <: BlockSparseTensor @testset "Random constructor" begin - T = dev(randomBlockSparseTensor([(1, 1), (2, 2)], ([2, 2], [2, 2]))) + T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) @test nnzblocks(T) == 2 @test nnz(T) == 8 @test eltype(T) == elt @@ -125,20 +130,21 @@ using GPUArraysCore: @allowscalar @test eltype(cT) == complex(elt) @test nnzblocks(cT) == nnzblocks(T) end + @testset "similartype regression test" begin # Regression test for issue seen in: # https://github.com/ITensor/ITensorInfiniteMPS.jl/pull/77 # Previously, `similartype` wasn't using information about the dimensions # properly and was returning a `BlockSparse` storage of the dimensions # of the input tensor. - T = dev(BlockSparseTensor([(1, 1)], ([2], [2]))) + T = dev(BlockSparseTensor(elt, [(1, 1)], ([2], [2]))) @test NDTensors.ndims( NDTensors.storagetype(NDTensors.similartype(typeof(T), ([2], [2], [2]))) ) == 3 end @testset "Random constructor" begin - T = dev(randomBlockSparseTensor([(1, 1), (2, 2)], ([2, 2], [2, 2]))) + T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) @test nnzblocks(T) == 2 @test nnz(T) == 8 @test eltype(T) == elt @@ -154,7 +160,7 @@ using GPUArraysCore: @allowscalar @testset "permute_combine" begin indsA = ([2, 3], [4, 5], [6, 7, 8]) locsA = [(2, 1, 1), (1, 2, 1), (2, 2, 3)] - A = dev(BlockSparseTensor(locsA, indsA...)) + A = dev(BlockSparseTensor{elt}(locsA, indsA...)) randn!(A) B = NDTensors.permute_combine(A, 3, (2, 1)) @@ -225,73 +231,73 @@ using GPUArraysCore: @allowscalar @test isblocknz(T, (2, 2)) end - @testset "svd on $dev" for dev in devs + @testset "svd on $dev, eltype: $elt" for dev in devices_list(copy(ARGS)), + elt in (Float32, Float64) + + if dev == NDTensors.mtl && elt == Float64 + continue + end @testset "svd example 1" begin - A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2])) + A = dev(BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2])) randn!(A) U, S, V = svd(A) - @test @allowscalar isapprox( - norm(array(U) * array(S) * array(V)' - array(A)), 0; atol=1e-14 - ) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(elt) end @testset "svd example 2" begin - A = dev(BlockSparseTensor([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) + A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) randn!(A) U, S, V = svd(A) - @test @allowscalar isapprox( - norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14 - ) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(elt) end @testset "svd example 3" begin - A = dev(BlockSparseTensor([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) + A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) randn!(A) U, S, V = svd(A) - @test @allowscalar isapprox( - norm(array(U) * array(S) * array(V)' - array(A)), 
0.0; atol=1e-14 - ) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(elt) end @testset "svd example 4" begin - A = dev(BlockSparseTensor([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) + A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) randn!(A) U, S, V = svd(A) - @test @allowscalar isapprox( - norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13 - ) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(elt) end @testset "svd example 5" begin - A = dev(BlockSparseTensor([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) + A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) randn!(A) U, S, V = svd(A) - @test @allowscalar isapprox( - norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13 - ) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(elt) end end - @testset "exp" begin - A = BlockSparseTensor([(1, 1), (2, 2)], [2, 4], [2, 4]) + @testset "exp, eltype: $elt" for elt in (Float32, Float64) + A = BlockSparseTensor{elt}([(1, 1), (2, 2)], [2, 4], [2, 4]) randn!(A) expT = exp(A) - @test isapprox(norm(array(expT) - exp(array(A))), 0.0; atol=1e-13) + @test array(expT) ≈ exp(array(A)) + atol = default_rtol(elt) # Hermitian case - A = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2])) + A = BlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2])) randn!(A) - Ah = BlockSparseTensor(ComplexF64, undef, [(1, 1), (2, 2)], ([2, 2], [2, 2])) + Ah = BlockSparseTensor(complex(elt), undef, [(1, 1), (2, 2)], ([2, 2], [2, 2])) for bA in eachnzblock(A) b = blockview(A, bA) blockview(Ah, bA) .= b + b' end expTh = exp(Hermitian(Ah)) - @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = 1e-13 + @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = default_rtol(eltype(Ah)) - A = BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2]) + A = BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2]) @test_throws ErrorException exp(A) end end - -nothing +end diff --git a/NDTensors/test/combiner.jl b/NDTensors/test/test_combiner.jl similarity index 79% rename from NDTensors/test/combiner.jl rename to NDTensors/test/test_combiner.jl index 52e1cae5b4..989b0701c9 100644 --- a/NDTensors/test/combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -1,22 +1,27 @@ +@eval module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: devices_list, is_supported_eltype # Testing generic block indices using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - include("device_list.jl") - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs + @testset "test device: $dev, eltype: $elt" for dev in devices_list(copy(ARGS)), + elt in (Float64, Float32) + + if !is_supported_eltype(dev, elt) + continue + end @testset "Dense * Combiner" begin d = 2 input_tensor_inds = (d, d, d) combiner_tensor_inds = (d^2, d, d) output_tensor_inds = (d, d^2) - input_tensor = dev(tensor(Dense(randn(input_tensor_inds)), input_tensor_inds)) + input_tensor = dev(tensor(Dense(randn(elt, input_tensor_inds)), input_tensor_inds)) combiner_tensor = dev(tensor(Combiner([1], [1]), combiner_tensor_inds)) output_tensor = contract(input_tensor, (1, -1, -2), combiner_tensor, (2, -1, -2)) @@ -32,7 +37,7 @@ using ITensors: QN, Index # Catch invalid combining input_tensor_inds = (d,) - input_tensor = 
dev(tensor(Dense(randn(input_tensor_inds)), input_tensor_inds)) + input_tensor = dev(tensor(Dense(randn(elt, input_tensor_inds)), input_tensor_inds)) combiner_tensor = dev(tensor(Combiner([1], [1]), combiner_tensor_inds)) @test_throws Any contract(input_tensor, (-1,), combiner_tensor, (1, -1, -2)) end @@ -51,7 +56,7 @@ using ITensors: QN, Index input_tensor = dev( tensor( BlockSparse( - randn(dim(input_tensor_inds)), BlockOffsets{3}([Block(1, 1, 1)], [0]) + randn(elt, dim(input_tensor_inds)), BlockOffsets{3}([Block(1, 1, 1)], [0]) ), input_tensor_inds, ), @@ -76,7 +81,7 @@ using ITensors: QN, Index invalid_input_tensor = dev( tensor( BlockSparse( - randn(dim(invalid_input_tensor_inds)), BlockOffsets{1}([Block(1)], [0]) + randn(elt, dim(invalid_input_tensor_inds)), BlockOffsets{1}([Block(1)], [0]) ), invalid_input_tensor_inds, ), @@ -86,3 +91,4 @@ using ITensors: QN, Index end end end +end diff --git a/NDTensors/test/dense.jl b/NDTensors/test/test_dense.jl similarity index 93% rename from NDTensors/test/dense.jl rename to NDTensors/test/test_dense.jl index e716c2b4f0..2180877e4a 100644 --- a/NDTensors/test/dense.jl +++ b/NDTensors/test/test_dense.jl @@ -1,13 +1,13 @@ +@eval module $(gensym()) using NDTensors -using Test +using Test: @testset, @test, @test_throws, @test_broken using GPUArraysCore: @allowscalar +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: devices_list @testset "Dense Tensors" begin - include("device_list.jl") - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs + @testset "test device: $dev" for dev in devices_list(copy(ARGS)) elt = dev == NDTensors.mtl ? Float32 : Float64 - # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin A = dev(Tensor(elt, (3, 4))) @@ -33,7 +33,7 @@ using GPUArraysCore: @allowscalar Aview = A[2:3, 2:3] @test dims(Aview) == (2, 2) - B = dev(Tensor(undef, (3, 4))) + B = dev(Tensor(elt, undef, (3, 4))) randn!(B) C = copy(A) C = permutedims!!(C, B, (1, 2), +) @@ -68,10 +68,8 @@ using GPUArraysCore: @allowscalar @test A[2, 2] == Aview[1, 1] end - ## Right now this treats the `Tensor` type as an abstract Array - ## And calls getindex instead of CUDA.==. Can fix by converting to CPU or - ## Just looking at the data - @test data(A * 2.0) == data(2.0 * A) + ## add elt around 2.0 to preserve the eltype of A. 
+ @test data(A * elt(2.0)) == data(elt(2.0) * A) Asim = similar(data(A), 10) @test eltype(Asim) == elt @@ -116,8 +114,8 @@ using GPUArraysCore: @allowscalar @test dim(I) == 1000 @test Array(I) == I_arr - J = dev(Tensor((2, 2))) - K = dev(Tensor((2, 2))) + J = dev(Tensor(elt, (2, 2))) + K = dev(Tensor(elt, (2, 2))) @test Array(J * K) ≈ Array(J) * Array(K) end @@ -213,7 +211,7 @@ using GPUArraysCore: @allowscalar R = dev(Tensor(complex(elt), (2, 2, 1))) fill!(R, NaN) @test @allowscalar any(isnan, R) - T1 = dev(randomTensor((2, 2, 1))) + T1 = dev(randomTensor(elt, (2, 2, 1))) T2 = dev(randomTensor(complex(elt), (1, 1))) NDTensors.contract!(R, (1, 2, 3), T1, (1, 2, -1), T2, (-1, 1)) @test @allowscalar !any(isnan, R) @@ -224,7 +222,7 @@ using GPUArraysCore: @allowscalar R = dev(Tensor(complex(elt), (2, 2, 1))) fill!(R, NaN) @test @allowscalar any(isnan, R) - T1 = dev(randomTensor((2, 2, 1))) + T1 = dev(randomTensor(elt, (2, 2, 1))) T2 = dev(randomTensor(complex(elt), (1, 1))) NDTensors.contract!(R, (2, 1, 3), T1, (1, 2, -1), T2, (-1, 1)) @test @allowscalar !any(isnan, R) @@ -276,3 +274,4 @@ using GPUArraysCore: @allowscalar end nothing +end diff --git a/NDTensors/test/diag.jl b/NDTensors/test/test_diag.jl similarity index 87% rename from NDTensors/test/diag.jl rename to NDTensors/test/test_diag.jl index 168049a9ef..835ebe66ca 100644 --- a/NDTensors/test/diag.jl +++ b/NDTensors/test/test_diag.jl @@ -1,14 +1,15 @@ +@eval module $(gensym()) using NDTensors -using Test +using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: devices_list, is_supported_eltype @testset "DiagTensor basic functionality" begin - include("device_list.jl") - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs, + @testset "test device: $dev" for dev in devices_list(copy(ARGS)), elt in (Float32, ComplexF32, Float64, ComplexF64) - if dev == NDTensors.mtl && real(elt) ≠ Float32 + if !is_supported_eltype(dev, elt) # Metal doesn't support double precision continue end @@ -64,3 +65,4 @@ end @test contract(A, (-2, 1), t, (-2, 3)) == transpose(A) end nothing +end diff --git a/NDTensors/test/diagblocksparse.jl b/NDTensors/test/test_diagblocksparse.jl similarity index 89% rename from NDTensors/test/diagblocksparse.jl rename to NDTensors/test/test_diagblocksparse.jl index 94cc3e6ff4..f47f4b5da0 100644 --- a/NDTensors/test/diagblocksparse.jl +++ b/NDTensors/test/test_diagblocksparse.jl @@ -1,6 +1,7 @@ -using Dictionaries +@eval module $(gensym()) +using Dictionaries: Dictionary using NDTensors -using Test +using Test: @testset, @test @testset "UniformDiagBlockSparseTensor basic functionality" begin NeverAlias = NDTensors.NeverAlias @@ -24,3 +25,4 @@ using Test @test conj(NeverAlias(), tensor)[1, 1] == conj(c) @test conj(AllowAlias(), tensor)[1, 1] == conj(c) end +end diff --git a/NDTensors/test/emptynumber.jl b/NDTensors/test/test_emptynumber.jl similarity index 90% rename from NDTensors/test/emptynumber.jl rename to NDTensors/test/test_emptynumber.jl index 34cdb8e2ce..dc8357a115 100644 --- a/NDTensors/test/emptynumber.jl +++ b/NDTensors/test/test_emptynumber.jl @@ -1,6 +1,6 @@ +@eval module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using Test: @testset, @test, @test_throws const 𝟎 = NDTensors.EmptyNumber() @@ -29,3 +29,4 @@ const 𝟎 = NDTensors.EmptyNumber() @test norm(𝟎) == 0.0 @test norm(𝟎) isa Float64 end +end diff --git a/NDTensors/test/emptystorage.jl 
b/NDTensors/test/test_emptystorage.jl similarity index 82% rename from NDTensors/test/emptystorage.jl rename to NDTensors/test/test_emptystorage.jl index 0bb37b7c1c..1f82ae2a57 100644 --- a/NDTensors/test/emptystorage.jl +++ b/NDTensors/test/test_emptystorage.jl @@ -1,10 +1,11 @@ +@eval module $(gensym()) using NDTensors -using Test +using Test: @testset, @test +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: devices_list @testset "EmptyStorage test" begin - include("device_list.jl") - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs + @testset "test device: $dev" for dev in devices_list(copy(ARGS)) T = dev(Tensor(EmptyStorage(NDTensors.EmptyNumber), (2, 2))) @test size(T) == (2, 2) @test eltype(T) == NDTensors.EmptyNumber @@ -31,3 +32,4 @@ using Test @test zero(T) isa typeof(T) end end +end diff --git a/NDTensors/test/linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl similarity index 80% rename from NDTensors/test/linearalgebra.jl rename to NDTensors/test/test_linearalgebra.jl index 11e94dd362..289b2c4625 100644 --- a/NDTensors/test/linearalgebra.jl +++ b/NDTensors/test/test_linearalgebra.jl @@ -1,7 +1,11 @@ +@eval module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using NDTensors: cpu +using LinearAlgebra: Diagonal, qr, diag +using Test: @testset, @test using GPUArraysCore: @allowscalar +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: devices_list, is_supported_eltype @testset "random_orthog" begin n, m = 10, 4 @@ -21,20 +25,18 @@ end @test norm(U2 * U2' - Diagonal(fill(1.0, m))) < 1E-14 end -include("device_list.jl") -devs = devices_list(copy(ARGS)) @testset "QX testing" begin @testset "Dense $qx decomposition, elt=$elt, positve=$positive, singular=$singular, device=$dev" for qx in [ qr, ql ], - elt in [Float64, ComplexF64, Float32, ComplexF32], + elt in (Float64, ComplexF64, Float32, ComplexF32), positive in [false, true], singular in [false, true], - dev in devs + dev in devices_list(copy(ARGS)) ## Skip Float64 on Metal - if dev == NDTensors.mtl && (elt == Float64 || elt == ComplexF64) + if !is_supported_eltype(dev, elt) continue end eps = Base.eps(real(elt)) * 100 #this is set rather tight, so if you increase/change m,n you may have open up the tolerance on eps. @@ -52,9 +54,9 @@ devs = devices_list(copy(ARGS)) end Q, X = qx(A; positive=positive) #X is R or L. Ap = Q * X - @test NDTensors.cpu(A) ≈ NDTensors.cpu(Ap) atol = eps - @test NDTensors.cpu(array(Q)' * array(Q)) ≈ Id atol = eps - @test NDTensors.cpu(array(Q) * array(Q)') ≈ Id atol = eps + @test cpu(A) ≈ cpu(Ap) atol = eps + @test cpu(array(Q)' * array(Q)) ≈ Id atol = eps + @test cpu(array(Q) * array(Q)') ≈ Id atol = eps @allowscalar if positive nr, nc = size(X) dr = qx == ql ? Base.max(0, nc - nr) : 0 @@ -74,8 +76,8 @@ devs = devices_list(copy(ARGS)) end Q, X = qx(A; positive=positive) Ap = Q * X - @test NDTensors.cpu(A) ≈ NDTensors.cpu(Ap) atol = eps - @test NDTensors.cpu(array(Q)' * array(Q)) ≈ Id atol = eps + @test cpu(A) ≈ cpu(Ap) atol = eps + @test cpu(array(Q)' * array(Q)) ≈ Id atol = eps @allowscalar if positive nr, nc = size(X) dr = qx == ql ? 
Base.max(0, nc - nr) : 0 @@ -87,3 +89,4 @@ devs = devices_list(copy(ARGS)) end nothing +end diff --git a/NDTensors/test/tupletools.jl b/NDTensors/test/test_tupletools.jl similarity index 65% rename from NDTensors/test/tupletools.jl rename to NDTensors/test/test_tupletools.jl index f8e0ec1ffa..3af9a1e5bc 100644 --- a/NDTensors/test/tupletools.jl +++ b/NDTensors/test/test_tupletools.jl @@ -1,5 +1,6 @@ -using Test -using NDTensors +@eval module $(gensym()) +using Test: @testset, @test +using NDTensors: NDTensors @testset "Test non-exported tuple tools" begin @test NDTensors.diff((1, 3, 6, 4)) == (2, 3, -2) @@ -7,3 +8,4 @@ using NDTensors end nothing +end diff --git a/src/ITensors.jl b/src/ITensors.jl index bc96ecfc81..e91fb6959d 100644 --- a/src/ITensors.jl +++ b/src/ITensors.jl @@ -71,6 +71,7 @@ using Random using SerializedElementArrays using StaticArrays using TimerOutputs +using TupleTools using Zeros ##################################### diff --git a/src/mps/dmrg.jl b/src/mps/dmrg.jl index eda11f5292..49444e606e 100644 --- a/src/mps/dmrg.jl +++ b/src/mps/dmrg.jl @@ -10,7 +10,7 @@ function permute( for n in 1:length(M) lₙ₋₁ = linkind(M, n - 1) lₙ = linkind(M, n) - s⃗ₙ = sort(Tuple(siteinds(M, n)); by=plev) + s⃗ₙ = NDTensors.TupleTools.sort(Tuple(siteinds(M, n)); by=plev) M̃[n] = permute(M[n], filter(!isnothing, (lₙ₋₁, s⃗ₙ..., lₙ))) end set_ortho_lims!(M̃, ortho_lims(M)) diff --git a/src/physics/fermions.jl b/src/physics/fermions.jl index 262721746e..289576436a 100644 --- a/src/physics/fermions.jl +++ b/src/physics/fermions.jl @@ -157,8 +157,8 @@ end # may be a tuple of QNIndex, so convert to a Vector{Index} indsR = collect(input_indsR) - nlabelsT1 = NDTensors.sort(labelsT1; rev=true) - nlabelsT2 = NDTensors.sort(labelsT2) + nlabelsT1 = TupleTools.sort(labelsT1; rev=true) + nlabelsT2 = TupleTools.sort(labelsT2) # Make orig_labelsR from the order of # indices that would result by just diff --git a/test/base/test_decomp.jl b/test/base/test_decomp.jl index 086d0aea40..a4c8fc1554 100644 --- a/test/base/test_decomp.jl +++ b/test/base/test_decomp.jl @@ -140,7 +140,7 @@ end [ 0, 1, 2, 3 ], - elt in [Float64, ComplexF64] + elt in (Float64, ComplexF64) l = Index(5, "l") s = Index(2, "s")