From ab39bed7af0e15ae9499fe3b92f64e129f2fe124 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Thu, 16 Nov 2023 17:40:35 -0500
Subject: [PATCH 01/73] Same issue on Metal and CUDA

---
 NDTensors/ext/NDTensorsMetalExt/mul.jl | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl
index 7c388a0fc8..93e15319df 100644
--- a/NDTensors/ext/NDTensorsMetalExt/mul.jl
+++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl
@@ -19,3 +19,21 @@ function LinearAlgebra.mul!(
   mul!(CM', BM', AM', α, β)
   return unexpose(CM)
 end
+
+## Fix issue in Metal.jl where it cannot distinguish Transpose{Reshape{Adjoint{MtlArray}}}
+## as an MtlArray and calls generic matmul
+function LinearAlgebra.mul!(
+  CM::Exposed{<:MtlArray},
+  AM::Exposed{<:MtlArray},
+  BM::Exposed{
+    <:MtlArray,
+    <:LinearAlgebra.Transpose{
+      <:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
+    },
+  },
+  α,
+  β,
+)
+  mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
+  return unexpose(CM)
+end
\ No newline at end of file

From 5b63ddd9124fde229097c2eff1892f96f6e3e769 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 11:00:19 -0500
Subject: [PATCH 02/73] Replace _gemm! with mul!!

---
 NDTensors/src/dense/tensoralgebra/outer.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/NDTensors/src/dense/tensoralgebra/outer.jl b/NDTensors/src/dense/tensoralgebra/outer.jl
index 591b3bc160..9ac12e4590 100644
--- a/NDTensors/src/dense/tensoralgebra/outer.jl
+++ b/NDTensors/src/dense/tensoralgebra/outer.jl
@@ -25,7 +25,9 @@ function outer!(
   #RM = reshape(array(R), length(v1), length(v2))
   #RM .= v1 .* transpose(v2)
   #mul!(RM, v1, transpose(v2))
-  _gemm!('N', 'T', one(ElR), v1, v2, zero(ElR), RM)
+  #ger!(one(ElR), zero(ElR), v1, v2)
+  #_gemm!('N', 'T', one(ElR), v1, v2, zero(ElR), array(RM))
+  mul!!(array(RM), v1, transpose(v2), one(ElR), zero(ElR))
   #mul!!(RM, v1, transpose(v2), one(ElR), zero(ElR))
   return R
 end
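Both patches above are about dispatch rather than numerics: once device storage is buried under lazy wrappers, LinearAlgebra selects its generic scalar-indexing matmul, and the BLAS-backed `_gemm!` path never applied to GPU storage in the first place. A rough sketch of the failing wrapper chain (illustrative only: the sizes are arbitrary, and Metal.jl plus NDTensors' Unwrap module are assumed to be loaded):

  using Metal, LinearAlgebra
  using NDTensors.Unwrap: expose
  A = mtl(randn(Float32, 4, 4))
  B = transpose(reshape(A', (2, 8)))
  # B is a Transpose of a ReshapedArray of an Adjoint of an MtlMatrix, so no
  # MtlArray mul! method matches it, and mul!(C, X, B) falls back to the
  # generic (scalar-indexing) loop. Materializing the parent recovers a
  # plain MtlMatrix, restoring fast dispatch:
  Bfast = transpose(copy(expose(parent(B))))

This unwrap-by-copy step is the same trick the new mul! overload applies internally before multiplying.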
From 478131fefe3bce799211a5946dd6778297c36e13 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 11:09:41 -0500
Subject: [PATCH 03/73] Add comment

---
 NDTensors/src/dense/tensoralgebra/outer.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/NDTensors/src/dense/tensoralgebra/outer.jl b/NDTensors/src/dense/tensoralgebra/outer.jl
index 9ac12e4590..b2143c77f7 100644
--- a/NDTensors/src/dense/tensoralgebra/outer.jl
+++ b/NDTensors/src/dense/tensoralgebra/outer.jl
@@ -27,8 +27,9 @@ function outer!(
   #mul!(RM, v1, transpose(v2))
   #ger!(one(ElR), zero(ElR), v1, v2)
   #_gemm!('N', 'T', one(ElR), v1, v2, zero(ElR), array(RM))
+  ## There is no _gemm! defined for CUDA or Metal so it calls
+  ## generic matmul. Replace with mul!! to call correct mul!! (ger)
   mul!!(array(RM), v1, transpose(v2), one(ElR), zero(ElR))
-  #mul!!(RM, v1, transpose(v2), one(ElR), zero(ElR))
   return R
 end

From 1324868d7353378058dbc390c4cf6984835a1244 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 12:55:26 -0500
Subject: [PATCH 04/73] Fix some issues with permutedims in Metal

---
 NDTensors/ext/NDTensorsMetalExt/permutedims.jl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
index ad29610527..ce31a8abb9 100644
--- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
+++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
@@ -1,3 +1,8 @@
+function Base.permutedims(E::Exposed{<:MtlArray, <:Base.ReshapedArray}, perm)
+  A = copy(E)
+  return permutedims(expose(A), perm)
+end
+## There's an issue in Metal where `ReshapedArray`-wrapped arrays cannot be permuted
 function Base.permutedims!(
   Edest::Exposed{<:MtlArray,<:Base.ReshapedArray}, Esrc::Exposed{<:MtlArray}, perm
 )
@@ -5,3 +10,12 @@ end
   copyto!(expose(parent(Edest)), expose(Aperm))
   return unexpose(Edest)
 end
+
+## There's an issue in Metal where `ReshapedArray`-wrapped arrays cannot be permuted
+## To get around this copy and permute Esrc, reshape to the size of Edest's parent
+## and broadcast into the parent.
+function Base.permutedims!(Edest::Exposed{<:MtlArray, <:Base.ReshapedArray}, Esrc::Exposed{<:MtlArray, <:Base.ReshapedArray}, perm, f)
+  Aperm = unwrap_type(Esrc)(reshape(permutedims(Esrc, perm), size(parent(Edest))))
+  parent(Edest) .= f.(parent(Edest), Aperm)
+  return unexpose(Edest)
+end
\ No newline at end of file

From 526bf4a4ba152a4dae72db330ebcf8fc60112669 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 12:56:12 -0500
Subject: [PATCH 05/73] Change the default_eltype of NDTensors to Float32 if using Mtl

---
 NDTensors/test/blocksparse.jl | 7 ++++++-
 NDTensors/test/dense.jl       | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl
index e85580ab80..b1362debc3 100644
--- a/NDTensors/test/blocksparse.jl
+++ b/NDTensors/test/blocksparse.jl
@@ -9,7 +9,12 @@ using GPUArraysCore: @allowscalar
   devs = devices_list(copy(ARGS))

   @testset "test device: $dev" for dev in devs
-    elt = (dev == NDTensors.mtl ? Float32 : Float64)
+    if dev == NDTensors.mtl
+      elt = Float32
+      NDTensors.default_eltype() = Float32
+    else
+      elt = Float64
+    end

     # Indices
     indsA = ([2, 3], [4, 5])

diff --git a/NDTensors/test/dense.jl b/NDTensors/test/dense.jl
index e716c2b4f0..b5664eb763 100644
--- a/NDTensors/test/dense.jl
+++ b/NDTensors/test/dense.jl
@@ -6,7 +6,12 @@ using GPUArraysCore: @allowscalar
 include("device_list.jl")
 devs = devices_list(copy(ARGS))
 @testset "test device: $dev" for dev in devs
-  elt = dev == NDTensors.mtl ? Float32 : Float64
+  if dev == NDTensors.mtl
+    elt = Float32
+    NDTensors.default_eltype() = Float32
+  else
+    elt = Float64
+  end

   # Testing with GPU and CPU backends
   @testset "DenseTensor basic functionality" begin
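The motivation for forcing Float32 in the tests: Apple GPUs have no Float64 arithmetic, so Float64 device storage is rejected outright. A minimal illustration (assumes Metal.jl; the exact error text varies by version):

  using Metal
  MtlArray(randn(Float32, 2, 2))  # supported element type
  MtlArray(randn(Float64, 2, 2))  # throws: Metal does not support Float64 storage

Note that redefining `NDTensors.default_eltype()` inside a test is a global method override, a blunt instrument, which is why the following commits in this series first revert it to Float64 at the end of each test and eventually remove the trick altogether.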
From 4b9be1de2eeb35b97c53d906ba060bacb0febf6c Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 13:07:56 -0500
Subject: [PATCH 06/73] Blocksparse shouldn't use default of Float64. Use
 NDTensors.default_eltype

---
 NDTensors/src/blocksparse/blocksparsetensor.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl
index f59238f955..16218ed5b4 100644
--- a/NDTensors/src/blocksparse/blocksparsetensor.jl
+++ b/NDTensors/src/blocksparse/blocksparsetensor.jl
@@ -49,7 +49,7 @@ Construct a block sparse tensor with uninitialized memory from indices and locations of non-zero blocks.
 """
 function BlockSparseTensor(::UndefInitializer, blockoffsets, inds)
-  return BlockSparseTensor(Float64, undef, blockoffsets, inds)
+  return BlockSparseTensor(NDTensors.default_eltype(), undef, blockoffsets, inds)
 end

 function BlockSparseTensor(
@@ -65,7 +65,7 @@ function BlockSparseTensor(eltype::Type{<:Number}, blockoffsets::BlockOffsets, i
 end

 function BlockSparseTensor(blockoffsets::BlockOffsets, inds)
-  return BlockSparseTensor(Float64, blockoffsets, inds)
+  return BlockSparseTensor(NDTensors.default_eltype(), blockoffsets, inds)
 end

 """
@@ -73,7 +73,7 @@ end

 Construct a block sparse tensor with no blocks.
 """
-BlockSparseTensor(inds) = BlockSparseTensor(Float64, inds)
+BlockSparseTensor(inds) = BlockSparseTensor(NDTensors.default_eltype(), inds)

 function BlockSparseTensor(datatype::Type{<:AbstractArray}, inds)
   return BlockSparseTensor(datatype, BlockOffsets{length(inds)}(), inds)
@@ -99,7 +99,7 @@ Construct a block sparse tensor with the specified blocks. Defaults to setting structurally non-zero blocks to zero.
 """
 function BlockSparseTensor(blocks::Vector{BlockT}, inds) where {BlockT<:Union{Block,NTuple}}
-  return BlockSparseTensor(Float64, blocks, inds)
+  return BlockSparseTensor(NDTensors.default_eltype(), blocks, inds)
 end

 function BlockSparseTensor(
@@ -160,7 +160,7 @@ function randomBlockSparseTensor(blocks::Vector, inds)
 end

 function randomBlockSparseTensor(rng::AbstractRNG, blocks::Vector, inds)
-  return randomBlockSparseTensor(rng, Float64, blocks, inds)
+  return randomBlockSparseTensor(rng, NDTensors.default_eltype(), blocks, inds)
 end

 """

From 5e34f9dc89e251d4f24f157de295ba9455e7e512 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 13:08:16 -0500
Subject: [PATCH 07/73] Make sure to revert to original Float64

---
 NDTensors/test/dense.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/NDTensors/test/dense.jl b/NDTensors/test/dense.jl
index b5664eb763..369a8f1ecf 100644
--- a/NDTensors/test/dense.jl
+++ b/NDTensors/test/dense.jl
@@ -236,6 +236,9 @@ using GPUArraysCore: @allowscalar
         @test convert(Array, R) ≈ permutedims(convert(Array, T1), (2, 1, 3)) * T2[]
       end
     end
+    if dev == NDTensors.mtl
+      NDTensors.default_eltype() = Float64
+    end
   end

   # Only CPU backend testing

From 8f9f46c2922e9827f3595e3137da4a97bb1e68c3 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 13:08:49 -0500
Subject: [PATCH 08/73] If in Float32, need to adjust atol. Also turn Float32
 on and off appropriately

---
 NDTensors/test/blocksparse.jl | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl
index b1362debc3..29c15c59ed 100644
--- a/NDTensors/test/blocksparse.jl
+++ b/NDTensors/test/blocksparse.jl
@@ -174,6 +174,9 @@ using GPUArraysCore: @allowscalar
         @test reshape(blockAp, size(blockB)) == blockB
       end
     end
+    if dev == NDTensors.mtl
+      NDTensors.default_eltype() = Float64
+    end
   end

   @testset "BlockSparseTensor setindex! add block" begin
@@ -231,12 +234,16 @@ using GPUArraysCore: @allowscalar
   end

   @testset "svd on $dev" for dev in devs
+    if dev == NDTensors.mtl
+      NDTensors.default_eltype() = Float32
+    end
+    atol = 10^(0.75 * log10(eps(real(elt))))
    @testset "svd example 1" begin
      A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2]))
      randn!(A)
      U, S, V = svd(A)
      @test @allowscalar isapprox(
-       norm(array(U) * array(S) * array(V)' - array(A)), 0; atol=1e-14
+       norm(array(U) * array(S) * array(V)' - array(A)), 0; atol=atol
      )
    end

@@ -245,7 +252,7 @@ using GPUArraysCore: @allowscalar
      randn!(A)
      U, S, V = svd(A)
      @test @allowscalar isapprox(
-       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14
+       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
      )
    end

@@ -254,7 +261,7 @@ using GPUArraysCore: @allowscalar
      randn!(A)
      U, S, V = svd(A)
      @test @allowscalar isapprox(
-       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14
+       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
      )
    end

@@ -263,7 +270,7 @@ using GPUArraysCore: @allowscalar
      randn!(A)
      U, S, V = svd(A)
      @test @allowscalar isapprox(
-       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13
+       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
      )
    end

@@ -272,9 +279,12 @@ using GPUArraysCore: @allowscalar
      randn!(A)
      U, S, V = svd(A)
      @test @allowscalar isapprox(
-       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13
+       norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
      )
    end
+    if dev == NDTensors.mtl
+      NDTensors.default_eltype() = Float64
+    end
  end

  @testset "exp" begin
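To make the new tolerance concrete, the formula scales with the element type's machine epsilon. Evaluating it by hand (plain arithmetic, values rounded):

  atol(elt) = 10^(0.75 * log10(eps(real(elt))))
  atol(Float64)  # eps(Float64) ≈ 2.2e-16, so atol ≈ 1.8e-12, near the old 1e-13/1e-14
  atol(Float32)  # eps(Float32) ≈ 1.2e-7,  so atol ≈ 6.4e-6, suitably loose for single precision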
From 911542d53c559a9089a7022d197ac8ff5cc69095 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 13:10:30 -0500
Subject: [PATCH 09/73] There is no check when multiplying numbers; this might
 lead to an issue in numerical stability

---
 NDTensors/src/tensorstorage/tensorstorage.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/NDTensors/src/tensorstorage/tensorstorage.jl b/NDTensors/src/tensorstorage/tensorstorage.jl
index 9708379757..7ddb20777e 100644
--- a/NDTensors/src/tensorstorage/tensorstorage.jl
+++ b/NDTensors/src/tensorstorage/tensorstorage.jl
@@ -28,7 +28,8 @@ Base.@propagate_inbounds function Base.setindex!(S::TensorStorage, v, i::Integer
   return (setindex!(data(S), v, i); S)
 end

-(S::TensorStorage * x::Number) = setdata(S, x * data(S))
+## Missing a check or conversion when calling number * Tensor. This causes Metal to fail numerically because it tries to convert it to Float64. Preserve S eltype. ## TODO this could probably be handled differently/better?
+(S::TensorStorage * x::Number) = setdata(S, eltype(S)(x) * data(S))
 (x::Number * S::TensorStorage) = S * x
 (S::TensorStorage / x::Number) = setdata(S, data(S) / x)
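The failure mode this guards against, roughly (assumes Metal.jl; the point is the eltype promotion, not the exact error text):

  using Metal
  x = MtlArray(randn(Float32, 4))
  2.0f0 * x           # Float32 scalar: result stays an MtlVector{Float32}
  # 2.0 * x           # a Float64 scalar promotes the result eltype to Float64,
  #                   # which Metal cannot store, so the multiplication fails
  eltype(x)(2.0) * x  # converting the scalar first, as the patched method does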
From 911542d53c559a9089a7022d197ac8ff5cc69095 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 17 Nov 2023 13:11:21 -0500
Subject: [PATCH 10/73] Format

---
 NDTensors/ext/NDTensorsMetalExt/mul.jl         |  2 +-
 NDTensors/ext/NDTensorsMetalExt/permutedims.jl | 11 ++++++++---
 NDTensors/test/device_list.jl                  |  1 -
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl
index 93e15319df..b9e8667155 100644
--- a/NDTensors/ext/NDTensorsMetalExt/mul.jl
+++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl
@@ -36,4 +36,4 @@
 )
   mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
   return unexpose(CM)
-end
\ No newline at end of file
+end

diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
index ce31a8abb9..f507ede599 100644
--- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
+++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
@@ -1,4 +1,4 @@
-function Base.permutedims(E::Exposed{<:MtlArray, <:Base.ReshapedArray}, perm)
+function Base.permutedims(E::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm)
   A = copy(E)
   return permutedims(expose(A), perm)
 end
@@ -14,8 +14,13 @@ end
 ## There's an issue in Metal where `ReshapedArray`-wrapped arrays cannot be permuted
 ## To get around this copy and permute Esrc, reshape to the size of Edest's parent
 ## and broadcast into the parent.
-function Base.permutedims!(Edest::Exposed{<:MtlArray, <:Base.ReshapedArray}, Esrc::Exposed{<:MtlArray, <:Base.ReshapedArray}, perm, f)
+function Base.permutedims!(
+  Edest::Exposed{<:MtlArray,<:Base.ReshapedArray},
+  Esrc::Exposed{<:MtlArray,<:Base.ReshapedArray},
+  perm,
+  f,
+)
   Aperm = unwrap_type(Esrc)(reshape(permutedims(Esrc, perm), size(parent(Edest))))
   parent(Edest) .= f.(parent(Edest), Aperm)
   return unexpose(Edest)
-end
\ No newline at end of file
+end

diff --git a/NDTensors/test/device_list.jl b/NDTensors/test/device_list.jl
index 021a20a75b..12ff7c5d38 100644
--- a/NDTensors/test/device_list.jl
+++ b/NDTensors/test/device_list.jl
@@ -23,7 +23,6 @@ function devices_list(test_args)

   if "metal" in test_args || "all" in test_args
     push!(devs, NDTensors.mtl)
-    Metal.allowscalar()
   end
   return devs
 end

From 6780956383641a832c6c210ea1063fc5ee20eb5b Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Mon, 20 Nov 2023 12:46:14 -0500
Subject: [PATCH 11/73] Create NDTensorsTestUtils

---
 .../NDTensorsTestUtils/NDTensorsTestUtils.jl  | 11 ++++++++
 .../test/NDTensorsTestUtils/device_list.jl    | 28 +++++++++++++++++++
 .../NDTensorsTestUtils/is_suppoted_eltype.jl  |  5 ++++
 3 files changed, 44 insertions(+)
 create mode 100644 NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl
 create mode 100644 NDTensors/test/NDTensorsTestUtils/device_list.jl
 create mode 100644 NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl

diff --git a/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl
new file mode 100644
index 0000000000..02ceb080de
--- /dev/null
+++ b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl
@@ -0,0 +1,11 @@
+module NDTensorsTestUtils
+
+  using NDTensors
+
+  include("device_list.jl")
+  include("is_suppoted_eltype.jl")
+
+  default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt))))
+
+  export default_rtol, is_supported_eltype, devices_list;
+end
\ No
newline at end of file diff --git a/NDTensors/test/NDTensorsTestUtils/device_list.jl b/NDTensors/test/NDTensorsTestUtils/device_list.jl new file mode 100644 index 0000000000..12ff7c5d38 --- /dev/null +++ b/NDTensors/test/NDTensorsTestUtils/device_list.jl @@ -0,0 +1,28 @@ +if "cuda" in ARGS || "all" in ARGS + using CUDA +end +if "metal" in ARGS || "all" in ARGS + using Metal +end + +function devices_list(test_args) + devs = Vector{Function}(undef, 0) + if isempty(test_args) || "base" in test_args + push!(devs, NDTensors.cpu) + end + + if "cuda" in test_args || "all" in test_args + if CUDA.functional() + push!(devs, NDTensors.cu) + else + println( + "Warning: CUDA.jl is not functional on this architecture and tests will be skipped." + ) + end + end + + if "metal" in test_args || "all" in test_args + push!(devs, NDTensors.mtl) + end + return devs +end diff --git a/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl b/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl new file mode 100644 index 0000000000..7b23bf3535 --- /dev/null +++ b/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl @@ -0,0 +1,5 @@ +is_supported_eltype(dev, elt::Type) = true +is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{Float64}) = false +function is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{<:Complex}) + return is_supported_eltype(dev, real(elt)) +end \ No newline at end of file From 1ab3418ed0ed311536f67e52e385fef63ddc4d0d Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 20 Nov 2023 12:46:35 -0500 Subject: [PATCH 12/73] Move functions to NDTensorsTestUtils --- .../TestITensorDMRG/TestITensorDMRG.jl | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl index 618f2b6ced..058c301103 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl @@ -2,26 +2,18 @@ ## Failing for CUDA mostly with eigen (I believe there is some noise in ## eigen decomp with CUBLAS to give slightly different answer than BLAS) module TestITensorDMRG -using Test -using ITensors -using NDTensors -using Random + using Test + using ITensors + using NDTensors + using Random -reference_energies = Dict([ - (4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885) -]) + reference_energies = Dict([ + (4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885) + ]) -default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt)))) + is_broken(dev, elt::Type, conserve_qns::Val) = false + is_broken(dev::typeof(NDTensors.cu), elt::Type, conserve_qns::Val{true}) = true -is_supported_eltype(dev, elt::Type) = true -is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{Float64}) = false -function is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{<:Complex}) - return is_supported_eltype(dev, real(elt)) -end - -is_broken(dev, elt::Type, conserve_qns::Val) = false -is_broken(dev::typeof(NDTensors.cu), elt::Type, conserve_qns::Val{true}) = true - -include("dmrg.jl") + include("dmrg.jl") end From 2ff61ffbe5d79441292aa29369f2a7fad202ffbb Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 20 Nov 2023 12:49:28 -0500 Subject: [PATCH 13/73] Use NDTensorsTestUtils --- NDTensors/src/Unwrap/test/runtests.jl | 4 ++- .../test/ITensors/TestITensorDMRG/dmrg.jl | 2 ++ .../test/ITensors/TestITensorDMRG/runtests.jl | 7 +++-- NDTensors/test/blocksparse.jl | 24 
++++++----------
 NDTensors/test/combiner.jl                    |  4 ++-
 NDTensors/test/dense.jl                       |  4 ++-
 NDTensors/test/device_list.jl                 | 28 -------------------
 NDTensors/test/diag.jl                        |  4 ++-
 NDTensors/test/emptystorage.jl                |  4 ++-
 NDTensors/test/linearalgebra.jl               |  3 +-
 NDTensors/test/runtests.jl                    |  3 ++
 11 files changed, 35 insertions(+), 52 deletions(-)
 delete mode 100644 NDTensors/test/device_list.jl

diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl
index 68cbb74e18..f742e150e6 100644
--- a/NDTensors/src/Unwrap/test/runtests.jl
+++ b/NDTensors/src/Unwrap/test/runtests.jl
@@ -4,7 +4,9 @@ using NDTensors
 using LinearAlgebra
 using GPUArraysCore: @allowscalar

-include("../../../test/device_list.jl")
+include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl")
+using .NDTensorsTestUtils: devices_list
+
 @testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS),
   elt in (Float32, ComplexF32)

diff --git a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl
index ab73f5ff82..e9f3e26cfd 100644
--- a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl
+++ b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl
@@ -1,3 +1,5 @@
+include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl")
+using .NDTensorsTestUtils: default_rtol, is_supported_eltype, devices_list
 function test_dmrg(elt, N::Integer; dev::Function, conserve_qns)
   sites = siteinds("S=1/2", N; conserve_qns)

diff --git a/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl b/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl
index bccc179656..3c451ba275 100644
--- a/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl
+++ b/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl
@@ -1,15 +1,16 @@
 using Test
 using NDTensors
+## TODO header guard
+include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl")
+using .NDTensorsTestUtils: default_rtol, is_supported_eltype, devices_list
 include("TestITensorDMRG.jl")

-include("../../device_list.jl")
-
 @testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in devices_list(ARGS),
   conserve_qns in [false, true],
   elt in (Float32, ComplexF32, Float64, ComplexF64),
   N in [4, 10]

-  if !TestITensorDMRG.is_supported_eltype(dev, elt)
+  if !NDTensorsTestUtils.is_supported_eltype(dev, elt)
     continue
   end
   if TestITensorDMRG.is_broken(dev, elt, Val(conserve_qns))
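Taken together, the three diffs above route every test through one shared utilities module. The resulting gating pattern, sketched in isolation (the include path assumes the snippet runs from NDTensors/test, and the DMRG body itself is elided):

  include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
  using .NDTensorsTestUtils: devices_list, is_supported_eltype
  for dev in devices_list(ARGS), elt in (Float32, ComplexF32, Float64, ComplexF64)
    # e.g. (NDTensors.mtl, Float64) is filtered out here, since Metal lacks Float64
    is_supported_eltype(dev, elt) || continue
    # known-bad combinations (CUDA with conserved QNs) are instead marked broken
  end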
diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl
index 29c15c59ed..ed6d5f6dc1 100644
--- a/NDTensors/test/blocksparse.jl
+++ b/NDTensors/test/blocksparse.jl
@@ -2,10 +2,12 @@ using NDTensors
 using LinearAlgebra
 using Test
 using GPUArraysCore: @allowscalar
+## TODO header guard
+# include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+# using .NDTensorsTestUtils: default_rtol, devices_list

 @testset "BlockSparseTensor basic functionality" begin
   C = nothing
-  include("device_list.jl")
   devs = devices_list(copy(ARGS))

   @testset "test device: $dev" for dev in devs
@@ -237,41 +239,33 @@ using GPUArraysCore: @allowscalar
    if dev == NDTensors.mtl
      NDTensors.default_eltype() = Float32
    end
-    atol = 10^(0.75 * log10(eps(real(elt))))
+
    @testset "svd example 1" begin
      A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2]))
      randn!(A)
      U, S, V = svd(A)
-      @test @allowscalar isapprox(
-        norm(array(U) * array(S) * array(V)' - array(A)), 0; atol=atol
-      )
+      @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A))
    end

    @testset "svd example 2" begin
      A = dev(BlockSparseTensor([(1, 2), (2, 3)], [2, 2], [3, 2, 3]))
      randn!(A)
      U, S, V = svd(A)
-      @test @allowscalar isapprox(
-        norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
-      )
+      @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A))
    end

    @testset "svd example 3" begin
      A = dev(BlockSparseTensor([(2, 1), (3, 2)], [3, 2, 3], [2, 2]))
      randn!(A)
      U, S, V = svd(A)
-      @test @allowscalar isapprox(
-        norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
-      )
+      @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A))
    end

    @testset "svd example 4" begin
      A = dev(BlockSparseTensor([(2, 1), (3, 2)], [2, 3, 4], [5, 6]))
      randn!(A)
      U, S, V = svd(A)
-      @test @allowscalar isapprox(
-        norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
-      )
+      @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A))
    end

    @testset "svd example 5" begin
      A = dev(BlockSparseTensor([(1, 2), (2, 3)], [5, 6], [2, 3, 4]))
      randn!(A)
      U, S, V = svd(A)
      @test @allowscalar isapprox(
-        norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=atol
+        norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=default_rtol(eltype(A))
      )
    end
+    if dev == NDTensors.mtl
+      NDTensors.default_eltype() = Float64
+    end
  end

  @testset "exp" begin
    A = BlockSparseTensor([(1, 1), (2, 2)], [2, 4], [2, 4])
    randn!(A)
    expT = exp(A)
-    @test isapprox(norm(array(expT) - exp(array(A))), 0.0; atol=1e-13)
+    @test array(expT) ≈ exp(array(A)); atol=default_rtol(eltype(A))

    # Hermitian case
    A = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2]))
@@ -296,7 +278,7 @@ using GPUArraysCore: @allowscalar
      blockview(Ah, bA) .= b + b'
    end
    expTh = exp(Hermitian(Ah))
-    @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = 1e-13
+    @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = default_rtol(eltype(Ah))

    A = BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2])
    @test_throws ErrorException exp(A)

diff --git a/NDTensors/test/combiner.jl b/NDTensors/test/combiner.jl
index 52e1cae5b4..9c0f21fb75 100644
--- a/NDTensors/test/combiner.jl
+++ b/NDTensors/test/combiner.jl
@@ -2,12 +2,14 @@ using NDTensors
 using LinearAlgebra
 using Test
 using GPUArraysCore: @allowscalar
+## TODO header guard
+# include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+# using .NDTensorsTestUtils: default_rtol, devices_list

 # Testing generic block indices
 using ITensors: QN, Index

 @testset "CombinerTensor basic functionality" begin
-  include("device_list.jl")
   devs = devices_list(copy(ARGS))
   @testset "test device: $dev" for dev in devs
     @testset "Dense * Combiner" begin

diff --git a/NDTensors/test/dense.jl b/NDTensors/test/dense.jl
index 369a8f1ecf..adfc7bcdf6 100644
--- a/NDTensors/test/dense.jl
+++ b/NDTensors/test/dense.jl
@@ -1,9 +1,11 @@
 using NDTensors
 using Test
 using GPUArraysCore: @allowscalar
+## TODO header guard
+#include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+#using .NDTensorsTestUtils: default_rtol, devices_list

 @testset "Dense Tensors" begin
-  include("device_list.jl")
   devs = devices_list(copy(ARGS))
   @testset "test device: $dev" for dev in devs
     if dev == NDTensors.mtl
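The hand-rolled device_list.jl deleted below is what the new module replaces, with the same flag routing. Roughly (a sketch; GPU entries assume the corresponding packages are installed, and the include path assumes the snippet runs from NDTensors/test):

  include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
  using .NDTensorsTestUtils: devices_list
  devices_list(String[])   # -> [NDTensors.cpu]; passing "base" behaves the same
  devices_list(["metal"])  # -> [NDTensors.mtl]
  devices_list(["all"])    # -> GPU backends; CUDA is skipped with a warning if not functional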
diff --git a/NDTensors/test/device_list.jl b/NDTensors/test/device_list.jl
deleted file mode 100644
index 12ff7c5d38..0000000000
--- a/NDTensors/test/device_list.jl
+++ /dev/null
@@ -1,28 +0,0 @@
-if "cuda" in ARGS || "all" in ARGS
-  using CUDA
-end
-if "metal" in ARGS || "all" in ARGS
-  using Metal
-end
-
-function devices_list(test_args)
-  devs = Vector{Function}(undef, 0)
-  if isempty(test_args) || "base" in test_args
-    push!(devs, NDTensors.cpu)
-  end
-
-  if "cuda" in test_args || "all" in test_args
-    if CUDA.functional()
-      push!(devs, NDTensors.cu)
-    else
-      println(
-        "Warning: CUDA.jl is not functional on this architecture and tests will be skipped."
-      )
-    end
-  end
-
-  if "metal" in test_args || "all" in test_args
-    push!(devs, NDTensors.mtl)
-  end
-  return devs
-end

diff --git a/NDTensors/test/diag.jl b/NDTensors/test/diag.jl
index 168049a9ef..62540485d1 100644
--- a/NDTensors/test/diag.jl
+++ b/NDTensors/test/diag.jl
@@ -1,9 +1,11 @@
 using NDTensors
 using Test
 using GPUArraysCore: @allowscalar
+## TODO header guard
+# include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+# using .NDTensorsTestUtils: default_rtol, devices_list

 @testset "DiagTensor basic functionality" begin
-  include("device_list.jl")
   devs = devices_list(copy(ARGS))
   @testset "test device: $dev" for dev in devs,
     elt in (Float32, ComplexF32, Float64, ComplexF64)

diff --git a/NDTensors/test/emptystorage.jl b/NDTensors/test/emptystorage.jl
index 0bb37b7c1c..7c8651cb62 100644
--- a/NDTensors/test/emptystorage.jl
+++ b/NDTensors/test/emptystorage.jl
@@ -1,8 +1,10 @@
 using NDTensors
 using Test
+## TODO header guard
+# include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+# using .NDTensorsTestUtils: devices_list

 @testset "EmptyStorage test" begin
-  include("device_list.jl")
   devs = devices_list(copy(ARGS))
   @testset "test device: $dev" for dev in devs
     T = dev(Tensor(EmptyStorage(NDTensors.EmptyNumber), (2, 2)))

diff --git a/NDTensors/test/linearalgebra.jl b/NDTensors/test/linearalgebra.jl
index 11e94dd362..051d7c03c5 100644
--- a/NDTensors/test/linearalgebra.jl
+++ b/NDTensors/test/linearalgebra.jl
@@ -2,6 +2,8 @@ using NDTensors
 using LinearAlgebra
 using Test
 using GPUArraysCore: @allowscalar
+include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+using .NDTensorsTestUtils: default_rtol, devices_list

 @testset "random_orthog" begin
   n, m = 10, 4
@@ -21,7 +23,6 @@ end
   @test norm(U2 * U2' - Diagonal(fill(1.0, m))) < 1E-14
 end

-include("device_list.jl")
 devs = devices_list(copy(ARGS))
 @testset "QX testing" begin
   @testset "Dense $qx decomposition, elt=$elt, positve=$positive, singular=$singular, device=$dev" for qx in

diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl
index 34718b84ea..8e1cb7217f 100644
--- a/NDTensors/test/runtests.jl
+++ b/NDTensors/test/runtests.jl
@@ -1,6 +1,9 @@
 using Test
 using SafeTestsets

+include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
+using .NDTensorsTestUtils: default_rtol, devices_list
+
 println("Passing arguments ARGS=$(ARGS) to test.")

 if isempty(ARGS) || "base" in ARGS

From 09e95cf58a39ae49f53d0b7262ad72f850677689 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Mon, 20 Nov 2023 13:55:48 -0500
Subject: [PATCH 14/73] Remove global modification of default_eltype

---
 NDTensors/test/blocksparse.jl | 24 +++---------------------
 NDTensors/test/dense.jl       | 10 ----------
 2 files changed, 3 insertions(+), 31 deletions(-)

diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl
index ed6d5f6dc1..6fe74fb33f 100644
--- a/NDTensors/test/blocksparse.jl
+++ b/NDTensors/test/blocksparse.jl
@@ -11,12 +11,6 @@ using GPUArraysCore: @allowscalar
   devs = devices_list(copy(ARGS))

   @testset "test device: $dev" for dev in devs
-    if dev == NDTensors.mtl
-      elt = Float32
-      NDTensors.default_eltype() = Float32
-    else
-      elt = Float64
-    end

     # Indices
     indsA = ([2, 3], [4, 5])
@@ -176,9 +170,6 @@ using GPUArraysCore: @allowscalar
         @test reshape(blockAp, size(blockB)) == blockB
       end
     end
-    if dev == NDTensors.mtl
-      NDTensors.default_eltype() = Float64
-    end
   end

   @testset "BlockSparseTensor setindex!
add block" begin @@ -236,10 +227,6 @@ using GPUArraysCore: @allowscalar end @testset "svd on $dev" for dev in devs - if dev == NDTensors.mtl - NDTensors.default_eltype() = Float32 - end - @testset "svd example 1" begin A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2])) randn!(A) @@ -272,12 +259,7 @@ using GPUArraysCore: @allowscalar A = dev(BlockSparseTensor([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) randn!(A) U, S, V = svd(A) - @test @allowscalar isapprox( - norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=default_rtol(eltype(A)) - ) - end - if dev == NDTensors.mtl - NDTensors.default_eltype() = Float64 + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A)) end end @@ -285,7 +267,7 @@ using GPUArraysCore: @allowscalar A = BlockSparseTensor([(1, 1), (2, 2)], [2, 4], [2, 4]) randn!(A) expT = exp(A) - @test isapprox(norm(array(expT) - exp(array(A))), 0.0; atol=1e-13) + @test array(expT) ≈ exp(array(A)); atol=default_rtol(eltype(A)) # Hermitian case A = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2])) @@ -296,7 +278,7 @@ using GPUArraysCore: @allowscalar blockview(Ah, bA) .= b + b' end expTh = exp(Hermitian(Ah)) - @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = 1e-13 + @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = default_rtol(eltype(Ah)) A = BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2]) @test_throws ErrorException exp(A) diff --git a/NDTensors/test/dense.jl b/NDTensors/test/dense.jl index adfc7bcdf6..73fbae6a2c 100644 --- a/NDTensors/test/dense.jl +++ b/NDTensors/test/dense.jl @@ -8,13 +8,6 @@ using GPUArraysCore: @allowscalar @testset "Dense Tensors" begin devs = devices_list(copy(ARGS)) @testset "test device: $dev" for dev in devs - if dev == NDTensors.mtl - elt = Float32 - NDTensors.default_eltype() = Float32 - else - elt = Float64 - end - # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin A = dev(Tensor(elt, (3, 4))) @@ -238,9 +231,6 @@ using GPUArraysCore: @allowscalar @test convert(Array, R) ≈ permutedims(convert(Array, T1), (2, 1, 3)) * T2[] end end - if dev == NDTensors.mtl - NDTensors.default_eltype() = Float64 - end end # Only CPU backend testing From cb2bd51b4ed7d82f72de9371cf927bdf83a2e727 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 20 Nov 2023 13:56:46 -0500 Subject: [PATCH 15/73] format --- .../TestITensorDMRG/TestITensorDMRG.jl | 20 +++++++++---------- .../NDTensorsTestUtils/NDTensorsTestUtils.jl | 12 +++++------ .../NDTensorsTestUtils/is_suppoted_eltype.jl | 2 +- NDTensors/test/blocksparse.jl | 18 +++++++++++------ 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl index 058c301103..66ed6f0b6b 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl @@ -2,18 +2,18 @@ ## Failing for CUDA mostly with eigen (I believe there is some noise in ## eigen decomp with CUBLAS to give slightly different answer than BLAS) module TestITensorDMRG - using Test - using ITensors - using NDTensors - using Random +using Test +using ITensors +using NDTensors +using Random - reference_energies = Dict([ - (4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885) - ]) +reference_energies = Dict([ + (4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885) +]) - is_broken(dev, elt::Type, conserve_qns::Val) = false - 
is_broken(dev::typeof(NDTensors.cu), elt::Type, conserve_qns::Val{true}) = true +is_broken(dev, elt::Type, conserve_qns::Val) = false +is_broken(dev::typeof(NDTensors.cu), elt::Type, conserve_qns::Val{true}) = true - include("dmrg.jl") +include("dmrg.jl") end diff --git a/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl index 02ceb080de..5976ac8cc7 100644 --- a/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl +++ b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl @@ -1,11 +1,11 @@ module NDTensorsTestUtils - using NDTensors +using NDTensors - include("device_list.jl") - include("is_suppoted_eltype.jl") +include("device_list.jl") +include("is_suppoted_eltype.jl") - default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt)))) +default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt)))) - export default_rtol, is_supported_eltype, devices_list; -end \ No newline at end of file +export default_rtol, is_supported_eltype, devices_list +end diff --git a/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl b/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl index 7b23bf3535..0dc16531e8 100644 --- a/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl +++ b/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl @@ -2,4 +2,4 @@ is_supported_eltype(dev, elt::Type) = true is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{Float64}) = false function is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type{<:Complex}) return is_supported_eltype(dev, real(elt)) -end \ No newline at end of file +end diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl index 6fe74fb33f..df06923355 100644 --- a/NDTensors/test/blocksparse.jl +++ b/NDTensors/test/blocksparse.jl @@ -231,35 +231,40 @@ using GPUArraysCore: @allowscalar A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2])) randn!(A) U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A)) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(eltype(A)) end @testset "svd example 2" begin A = dev(BlockSparseTensor([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) randn!(A) U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A)) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(eltype(A)) end @testset "svd example 3" begin A = dev(BlockSparseTensor([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) randn!(A) U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A)) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(eltype(A)) end @testset "svd example 4" begin A = dev(BlockSparseTensor([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) randn!(A) U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A)) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(eltype(A)) end @testset "svd example 5" begin A = dev(BlockSparseTensor([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) randn!(A) U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A); atol=default_rtol(eltype(A)) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = default_rtol(eltype(A)) end end @@ -267,7 +272,8 @@ using GPUArraysCore: @allowscalar A = BlockSparseTensor([(1, 1), (2, 2)], [2, 4], [2, 4]) randn!(A) expT = exp(A) - @test array(expT) 
≈ exp(array(A)); atol=default_rtol(eltype(A))
+    @test array(expT) ≈ exp(array(A))
+    atol = default_rtol(eltype(A))

     # Hermitian case
     A = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2]))

From ed10f7134c6f7a0f839ba5532faf4444015f4f72 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Mon, 20 Nov 2023 17:04:29 -0500
Subject: [PATCH 16/73] Rename test files to test_FILENAME.jl

---
 NDTensors/test/blocksparse.jl                 | 294 -----------------
 ...rseArrays.jl => test_BlockSparseArrays.jl} |   0
 ...agonalArrays.jl => test_DiagonalArrays.jl} |   0
 ...dDimsArrays.jl => test_NamedDimsArrays.jl} |   0
 ...SetParameters.jl => test_SetParameters.jl} |   0
 .../{SmallVectors.jl => test_SmallVectors.jl} |   0
 .../{SortedSets.jl => test_SortedSets.jl}     |   0
 .../test/{TagSets.jl => test_TagSets.jl}      |   0
 ...TensorAlgebra.jl => test_TensorAlgebra.jl} |   0
 NDTensors/test/test_blocksparse.jl            | 301 ++++++++++++++++++
 .../test/{combiner.jl => test_combiner.jl}    |   0
 NDTensors/test/{dense.jl => test_dense.jl}    |   0
 NDTensors/test/{diag.jl => test_diag.jl}      |   0
 ...blocksparse.jl => test_diagblocksparse.jl} |   0
 .../{emptynumber.jl => test_emptynumber.jl}   |   0
 .../{emptystorage.jl => test_emptystorage.jl} |   0
 ...linearalgebra.jl => test_linearalgebra.jl} |   0
 .../test/{readwrite.jl => test_readwrite.jl}  |   0
 18 files changed, 301 insertions(+), 294 deletions(-)
 delete mode 100644 NDTensors/test/blocksparse.jl
 rename NDTensors/test/{BlockSparseArrays.jl => test_BlockSparseArrays.jl} (100%)
 rename NDTensors/test/{DiagonalArrays.jl => test_DiagonalArrays.jl} (100%)
 rename NDTensors/test/{NamedDimsArrays.jl => test_NamedDimsArrays.jl} (100%)
 rename NDTensors/test/{SetParameters.jl => test_SetParameters.jl} (100%)
 rename NDTensors/test/{SmallVectors.jl => test_SmallVectors.jl} (100%)
 rename NDTensors/test/{SortedSets.jl => test_SortedSets.jl} (100%)
 rename NDTensors/test/{TagSets.jl => test_TagSets.jl} (100%)
 rename NDTensors/test/{TensorAlgebra.jl => test_TensorAlgebra.jl} (100%)
 create mode 100644 NDTensors/test/test_blocksparse.jl
 rename NDTensors/test/{combiner.jl => test_combiner.jl} (100%)
 rename NDTensors/test/{dense.jl => test_dense.jl} (100%)
 rename NDTensors/test/{diag.jl => test_diag.jl} (100%)
 rename NDTensors/test/{diagblocksparse.jl => test_diagblocksparse.jl} (100%)
 rename NDTensors/test/{emptynumber.jl => test_emptynumber.jl} (100%)
 rename NDTensors/test/{emptystorage.jl => test_emptystorage.jl} (100%)
 rename NDTensors/test/{linearalgebra.jl => test_linearalgebra.jl} (100%)
 rename NDTensors/test/{readwrite.jl => test_readwrite.jl} (100%)

diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl
deleted file mode 100644
index df06923355..0000000000
--- a/NDTensors/test/blocksparse.jl
+++ /dev/null
@@ -1,294 +0,0 @@
-using NDTensors
-using LinearAlgebra
-using Test
-using GPUArraysCore: @allowscalar
-## TODO header guard
-# include("NDTensorsTestUtils/NDTensorsTestUtils.jl")
-# using .NDTensorsTestUtils: default_rtol, devices_list
-
-@testset "BlockSparseTensor basic functionality" begin
-  C = nothing
-  devs = devices_list(copy(ARGS))
-
-  @testset "test device: $dev" for dev in devs
-    # Indices
-    indsA = ([2, 3], [4, 5])
-
-    # Locations of non-zero blocks
-    locs = [(1, 2), (2, 1)]
-
-    A = dev(BlockSparseTensor(locs, indsA...))
-    randn!(A)
-
-    @test blockdims(A, (1, 2)) == (2, 5)
-    @test blockdims(A, (2, 1)) == (3, 4)
-    @test nnzblocks(A) == 2
-    @test nnz(A) == 2 * 5 + 3 * 4
-    @test inds(A) == ([2, 3], [4, 5])
-    @test isblocknz(A, (2, 1))
-    @test isblocknz(A, (1, 2))
-    @test !isblocknz(A, (1, 1))
-    @test
!isblocknz(A, (2, 2)) - - # Test different ways of getting nnz - @test nnz(blockoffsets(A), inds(A)) == nnz(A) - - B = 2 * A - @test B[1, 1] == 2 * A[1, 1] - @test nnz(A) == 2 * 5 + 3 * 4 - @test nnz(B) == 2 * 5 + 3 * 4 - @test nnzblocks(A) == 2 - @test nnzblocks(B) == 2 - - B = A / 2 - @test B[1, 1] == A[1, 1] / 2 - @test nnz(A) == 2 * 5 + 3 * 4 - @test nnz(B) == 2 * 5 + 3 * 4 - @test nnzblocks(A) == 2 - @test nnzblocks(B) == 2 - - @allowscalar begin - A[1, 5] = 15 - A[2, 5] = 25 - - @test A[1, 1] == 0 - @test A[1, 5] == 15 - @test A[2, 5] == 25 - end - D = dense(A) - - @allowscalar begin - @test D == A - - for I in eachindex(A) - @test D[I] == A[I] - end - end - - A12 = blockview(A, (1, 2)) - - @test dims(A12) == (2, 5) - - @allowscalar for I in eachindex(A12) - @test A12[I] == A[I + CartesianIndex(0, 4)] - end - - B = dev(BlockSparseTensor(undef, locs, indsA)) - randn!(B) - - C = A + B - - @allowscalar for I in eachindex(C) - @test C[I] == A[I] + B[I] - end - - Ap = permutedims(A, (2, 1)) - - @test blockdims(Ap, (1, 2)) == (4, 3) - @test blockdims(Ap, (2, 1)) == (5, 2) - @test nnz(A) == nnz(Ap) - @test nnzblocks(A) == nnzblocks(Ap) - - @allowscalar for I in eachindex(C) - @test A[I] == Ap[NDTensors.permute(I, (2, 1))] - end - - A = dev(BlockSparseTensor(complex(elt), locs, indsA)) - randn!(A) - @test conj(data(store(A))) == data(store(conj(A))) - @test typeof(conj(A)) <: BlockSparseTensor - - @testset "Random constructor" begin - T = dev(randomBlockSparseTensor([(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(T) == 2 - @test nnz(T) == 8 - @test eltype(T) == elt - @test norm(T) ≉ 0 - - Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(Tc) == 2 - @test nnz(Tc) == 8 - @test eltype(Tc) == complex(elt) - @test norm(Tc) ≉ 0 - end - - @testset "Complex Valued Operations" begin - T = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - rT = real(T) - @test eltype(rT) == elt - @test nnzblocks(rT) == nnzblocks(T) - iT = imag(T) - @test eltype(iT) == elt - @test nnzblocks(iT) == nnzblocks(T) - @test norm(rT)^2 + norm(iT)^2 ≈ norm(T)^2 - - cT = conj(T) - @test eltype(cT) == complex(elt) - @test nnzblocks(cT) == nnzblocks(T) - end - @testset "similartype regression test" begin - # Regression test for issue seen in: - # https://github.com/ITensor/ITensorInfiniteMPS.jl/pull/77 - # Previously, `similartype` wasn't using information about the dimensions - # properly and was returning a `BlockSparse` storage of the dimensions - # of the input tensor. 
- T = dev(BlockSparseTensor([(1, 1)], ([2], [2]))) - @test NDTensors.ndims( - NDTensors.storagetype(NDTensors.similartype(typeof(T), ([2], [2], [2]))) - ) == 3 - end - - @testset "Random constructor" begin - T = dev(randomBlockSparseTensor([(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(T) == 2 - @test nnz(T) == 8 - @test eltype(T) == elt - @test norm(T) ≉ 0 - - Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(Tc) == 2 - @test nnz(Tc) == 8 - @test eltype(Tc) == complex(elt) - @test norm(Tc) ≉ 0 - end - - @testset "permute_combine" begin - indsA = ([2, 3], [4, 5], [6, 7, 8]) - locsA = [(2, 1, 1), (1, 2, 1), (2, 2, 3)] - A = dev(BlockSparseTensor(locsA, indsA...)) - randn!(A) - - B = NDTensors.permute_combine(A, 3, (2, 1)) - @test nnzblocks(A) == nnzblocks(B) - @test nnz(A) == nnz(B) - - Ap = NDTensors.permutedims(A, (3, 2, 1)) - - @allowscalar for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) - blockAp = blockview(Ap, bAp) - blockB = blockview(B, bB) - @test reshape(blockAp, size(blockB)) == blockB - end - end - end - - @testset "BlockSparseTensor setindex! add block" begin - T = BlockSparseTensor([2, 3], [4, 5]) - - @allowscalar for I in eachindex(T) - @test T[I] == 0.0 - end - @test nnz(T) == 0 - @test nnzblocks(T) == 0 - @test !isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test !isblocknz(T, (1, 2)) - @test !isblocknz(T, (2, 2)) - - T[1, 1] = 1.0 - - @test T[1, 1] == 1.0 - @test nnz(T) == 8 - @test nnzblocks(T) == 1 - @test isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test !isblocknz(T, (1, 2)) - @test !isblocknz(T, (2, 2)) - - T[4, 8] = 2.0 - - @test T[4, 8] == 2.0 - @test nnz(T) == 8 + 15 - @test nnzblocks(T) == 2 - @test isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test !isblocknz(T, (1, 2)) - @test isblocknz(T, (2, 2)) - - T[1, 6] = 3.0 - - @test T[1, 6] == 3.0 - @test nnz(T) == 8 + 15 + 10 - @test nnzblocks(T) == 3 - @test isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test isblocknz(T, (1, 2)) - @test isblocknz(T, (2, 2)) - - T[4, 2] = 4.0 - - @test T[4, 2] == 4.0 - @test nnz(T) == 8 + 15 + 10 + 12 - @test nnzblocks(T) == 4 - @test isblocknz(T, (1, 1)) - @test isblocknz(T, (2, 1)) - @test isblocknz(T, (1, 2)) - @test isblocknz(T, (2, 2)) - end - - @testset "svd on $dev" for dev in devs - @testset "svd example 1" begin - A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = default_rtol(eltype(A)) - end - - @testset "svd example 2" begin - A = dev(BlockSparseTensor([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = default_rtol(eltype(A)) - end - - @testset "svd example 3" begin - A = dev(BlockSparseTensor([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = default_rtol(eltype(A)) - end - - @testset "svd example 4" begin - A = dev(BlockSparseTensor([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = default_rtol(eltype(A)) - end - - @testset "svd example 5" begin - A = dev(BlockSparseTensor([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = default_rtol(eltype(A)) - end - end - - @testset "exp" begin - A = 
BlockSparseTensor([(1, 1), (2, 2)], [2, 4], [2, 4]) - randn!(A) - expT = exp(A) - @test array(expT) ≈ exp(array(A)) - atol = default_rtol(eltype(A)) - - # Hermitian case - A = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2])) - randn!(A) - Ah = BlockSparseTensor(ComplexF64, undef, [(1, 1), (2, 2)], ([2, 2], [2, 2])) - for bA in eachnzblock(A) - b = blockview(A, bA) - blockview(Ah, bA) .= b + b' - end - expTh = exp(Hermitian(Ah)) - @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = default_rtol(eltype(Ah)) - - A = BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2]) - @test_throws ErrorException exp(A) - end -end - -nothing diff --git a/NDTensors/test/BlockSparseArrays.jl b/NDTensors/test/test_BlockSparseArrays.jl similarity index 100% rename from NDTensors/test/BlockSparseArrays.jl rename to NDTensors/test/test_BlockSparseArrays.jl diff --git a/NDTensors/test/DiagonalArrays.jl b/NDTensors/test/test_DiagonalArrays.jl similarity index 100% rename from NDTensors/test/DiagonalArrays.jl rename to NDTensors/test/test_DiagonalArrays.jl diff --git a/NDTensors/test/NamedDimsArrays.jl b/NDTensors/test/test_NamedDimsArrays.jl similarity index 100% rename from NDTensors/test/NamedDimsArrays.jl rename to NDTensors/test/test_NamedDimsArrays.jl diff --git a/NDTensors/test/SetParameters.jl b/NDTensors/test/test_SetParameters.jl similarity index 100% rename from NDTensors/test/SetParameters.jl rename to NDTensors/test/test_SetParameters.jl diff --git a/NDTensors/test/SmallVectors.jl b/NDTensors/test/test_SmallVectors.jl similarity index 100% rename from NDTensors/test/SmallVectors.jl rename to NDTensors/test/test_SmallVectors.jl diff --git a/NDTensors/test/SortedSets.jl b/NDTensors/test/test_SortedSets.jl similarity index 100% rename from NDTensors/test/SortedSets.jl rename to NDTensors/test/test_SortedSets.jl diff --git a/NDTensors/test/TagSets.jl b/NDTensors/test/test_TagSets.jl similarity index 100% rename from NDTensors/test/TagSets.jl rename to NDTensors/test/test_TagSets.jl diff --git a/NDTensors/test/TensorAlgebra.jl b/NDTensors/test/test_TensorAlgebra.jl similarity index 100% rename from NDTensors/test/TensorAlgebra.jl rename to NDTensors/test/test_TensorAlgebra.jl diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl new file mode 100644 index 0000000000..95fb44dccb --- /dev/null +++ b/NDTensors/test/test_blocksparse.jl @@ -0,0 +1,301 @@ +using SafeTestsets: @safetestset + +@safetestset "test_blocksparse" begin + using NDTensors + using LinearAlgebra: exp, Hermitian, svd + using Test: @testset, @test, @test_throws + using GPUArraysCore: @allowscalar + include("NDTensorsTestUtils/NDTensorsTestUtils.jl") + using .NDTensorsTestUtils: NDTensorsTestUtils + + @testset "BlockSparseTensor basic functionality" begin + C = nothing + + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float32, Float64] + if dev == NDTensors.mtl && elt == Float64 + continue; + end + # Indices + indsA = ([2, 3], [4, 5]) + + # Locations of non-zero blocks + locs = [(1, 2), (2, 1)] + + A = dev(BlockSparseTensor{elt}(locs, indsA...)) + randn!(A) + + @test blockdims(A, (1, 2)) == (2, 5) + @test blockdims(A, (2, 1)) == (3, 4) + @test nnzblocks(A) == 2 + @test nnz(A) == 2 * 5 + 3 * 4 + @test inds(A) == ([2, 3], [4, 5]) + @test isblocknz(A, (2, 1)) + @test isblocknz(A, (1, 2)) + @test !isblocknz(A, (1, 1)) + @test !isblocknz(A, (2, 2)) + + # Test different ways of getting nnz + @test nnz(blockoffsets(A), inds(A)) == nnz(A) + + B = 2 * A + 
@test B[1, 1] == 2 * A[1, 1] + @test nnz(A) == 2 * 5 + 3 * 4 + @test nnz(B) == 2 * 5 + 3 * 4 + @test nnzblocks(A) == 2 + @test nnzblocks(B) == 2 + + B = A / 2 + @test B[1, 1] == A[1, 1] / 2 + @test nnz(A) == 2 * 5 + 3 * 4 + @test nnz(B) == 2 * 5 + 3 * 4 + @test nnzblocks(A) == 2 + @test nnzblocks(B) == 2 + + @allowscalar begin + A[1, 5] = 15 + A[2, 5] = 25 + + @test A[1, 1] == 0 + @test A[1, 5] == 15 + @test A[2, 5] == 25 + end + D = dense(A) + + @allowscalar begin + @test D == A + + for I in eachindex(A) + @test D[I] == A[I] + end + end + + A12 = blockview(A, (1, 2)) + + @test dims(A12) == (2, 5) + + @allowscalar for I in eachindex(A12) + @test A12[I] == A[I + CartesianIndex(0, 4)] + end + + B = dev(BlockSparseTensor(undef, locs, indsA)) + randn!(B) + + C = A + B + + @allowscalar for I in eachindex(C) + @test C[I] == A[I] + B[I] + end + + Ap = permutedims(A, (2, 1)) + + @test blockdims(Ap, (1, 2)) == (4, 3) + @test blockdims(Ap, (2, 1)) == (5, 2) + @test nnz(A) == nnz(Ap) + @test nnzblocks(A) == nnzblocks(Ap) + + @allowscalar for I in eachindex(C) + @test A[I] == Ap[NDTensors.permute(I, (2, 1))] + end + + A = dev(BlockSparseTensor(complex(elt), locs, indsA)) + randn!(A) + @test conj(data(store(A))) == data(store(conj(A))) + @test typeof(conj(A)) <: BlockSparseTensor + + @testset "Random constructor" for elt in [Float32, Float64] + T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(T) == 2 + @test nnz(T) == 8 + @test eltype(T) == elt + @test norm(T) ≉ 0 + + Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(Tc) == 2 + @test nnz(Tc) == 8 + @test eltype(Tc) == complex(elt) + @test norm(Tc) ≉ 0 + end + + @testset "Complex Valued Operations" for elt in [Float32, Float64] + T = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + rT = real(T) + @test eltype(rT) == elt + @test nnzblocks(rT) == nnzblocks(T) + iT = imag(T) + @test eltype(iT) == elt + @test nnzblocks(iT) == nnzblocks(T) + @test norm(rT)^2 + norm(iT)^2 ≈ norm(T)^2 + + cT = conj(T) + @test eltype(cT) == complex(elt) + @test nnzblocks(cT) == nnzblocks(T) + end + + @testset "similartype regression test" for elt in [Float32, Float64] + # Regression test for issue seen in: + # https://github.com/ITensor/ITensorInfiniteMPS.jl/pull/77 + # Previously, `similartype` wasn't using information about the dimensions + # properly and was returning a `BlockSparse` storage of the dimensions + # of the input tensor. 
+ T = dev(BlockSparseTensor(elt, [(1, 1)], ([2], [2]))) + @test NDTensors.ndims( + NDTensors.storagetype(NDTensors.similartype(typeof(T), ([2], [2], [2]))) + ) == 3 + end + + @testset "Random constructor" for elt in [Float32, Float64] + T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(T) == 2 + @test nnz(T) == 8 + @test eltype(T) == elt + @test norm(T) ≉ 0 + + Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(Tc) == 2 + @test nnz(Tc) == 8 + @test eltype(Tc) == complex(elt) + @test norm(Tc) ≉ 0 + end + + @testset "permute_combine" for elt in [Float32, Float64] + indsA = ([2, 3], [4, 5], [6, 7, 8]) + locsA = [(2, 1, 1), (1, 2, 1), (2, 2, 3)] + A = dev(BlockSparseTensor{elt}(locsA, indsA...)) + randn!(A) + + B = NDTensors.permute_combine(A, 3, (2, 1)) + @test nnzblocks(A) == nnzblocks(B) + @test nnz(A) == nnz(B) + + Ap = NDTensors.permutedims(A, (3, 2, 1)) + + @allowscalar for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) + blockAp = blockview(Ap, bAp) + blockB = blockview(B, bB) + @test reshape(blockAp, size(blockB)) == blockB + end + end + end + + @testset "BlockSparseTensor setindex! add block" begin + T = BlockSparseTensor([2, 3], [4, 5]) + + @allowscalar for I in eachindex(T) + @test T[I] == 0.0 + end + @test nnz(T) == 0 + @test nnzblocks(T) == 0 + @test !isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test !isblocknz(T, (1, 2)) + @test !isblocknz(T, (2, 2)) + + T[1, 1] = 1.0 + + @test T[1, 1] == 1.0 + @test nnz(T) == 8 + @test nnzblocks(T) == 1 + @test isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test !isblocknz(T, (1, 2)) + @test !isblocknz(T, (2, 2)) + + T[4, 8] = 2.0 + + @test T[4, 8] == 2.0 + @test nnz(T) == 8 + 15 + @test nnzblocks(T) == 2 + @test isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test !isblocknz(T, (1, 2)) + @test isblocknz(T, (2, 2)) + + T[1, 6] = 3.0 + + @test T[1, 6] == 3.0 + @test nnz(T) == 8 + 15 + 10 + @test nnzblocks(T) == 3 + @test isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test isblocknz(T, (1, 2)) + @test isblocknz(T, (2, 2)) + + T[4, 2] = 4.0 + + @test T[4, 2] == 4.0 + @test nnz(T) == 8 + 15 + 10 + 12 + @test nnzblocks(T) == 4 + @test isblocknz(T, (1, 1)) + @test isblocknz(T, (2, 1)) + @test isblocknz(T, (1, 2)) + @test isblocknz(T, (2, 2)) + end + + @testset "svd on $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float32, Float64] + if dev == NDTensors.mtl && elt == Float64 + continue; + end + @testset "svd example 1" begin + A = dev(BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end + + @testset "svd example 2" begin + A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end + + @testset "svd example 3" begin + A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end + + @testset "svd example 4" begin + A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end + + @testset "svd example 
5" begin + A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end + end + + @testset "exp" for elt in [Float32, Float64] + A = BlockSparseTensor{elt}([(1, 1), (2, 2)], [2, 4], [2, 4]) + randn!(A) + expT = exp(A) + @test array(expT) ≈ exp(array(A)) + atol = NDTensorsTestUtils.default_rtol(elt) + + # Hermitian case + A = BlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2])) + randn!(A) + Ah = BlockSparseTensor(complex(elt), undef, [(1, 1), (2, 2)], ([2, 2], [2, 2])) + for bA in eachnzblock(A) + b = blockview(A, bA) + blockview(Ah, bA) .= b + b' + end + expTh = exp(Hermitian(Ah)) + @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = NDTensorsTestUtils.default_rtol(eltype(Ah)) + + A = BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2]) + @test_throws ErrorException exp(A) + end + end +end diff --git a/NDTensors/test/combiner.jl b/NDTensors/test/test_combiner.jl similarity index 100% rename from NDTensors/test/combiner.jl rename to NDTensors/test/test_combiner.jl diff --git a/NDTensors/test/dense.jl b/NDTensors/test/test_dense.jl similarity index 100% rename from NDTensors/test/dense.jl rename to NDTensors/test/test_dense.jl diff --git a/NDTensors/test/diag.jl b/NDTensors/test/test_diag.jl similarity index 100% rename from NDTensors/test/diag.jl rename to NDTensors/test/test_diag.jl diff --git a/NDTensors/test/diagblocksparse.jl b/NDTensors/test/test_diagblocksparse.jl similarity index 100% rename from NDTensors/test/diagblocksparse.jl rename to NDTensors/test/test_diagblocksparse.jl diff --git a/NDTensors/test/emptynumber.jl b/NDTensors/test/test_emptynumber.jl similarity index 100% rename from NDTensors/test/emptynumber.jl rename to NDTensors/test/test_emptynumber.jl diff --git a/NDTensors/test/emptystorage.jl b/NDTensors/test/test_emptystorage.jl similarity index 100% rename from NDTensors/test/emptystorage.jl rename to NDTensors/test/test_emptystorage.jl diff --git a/NDTensors/test/linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl similarity index 100% rename from NDTensors/test/linearalgebra.jl rename to NDTensors/test/test_linearalgebra.jl diff --git a/NDTensors/test/readwrite.jl b/NDTensors/test/test_readwrite.jl similarity index 100% rename from NDTensors/test/readwrite.jl rename to NDTensors/test/test_readwrite.jl From c61bc5b17b2dc94fa733f205e70de4ff3c9fcd3b Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 12:03:05 -0500 Subject: [PATCH 17/73] Fix spelling mistake --- NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl | 3 +-- .../{is_suppoted_eltype.jl => is_supported_eltype.jl} | 0 2 files changed, 1 insertion(+), 2 deletions(-) rename NDTensors/test/NDTensorsTestUtils/{is_suppoted_eltype.jl => is_supported_eltype.jl} (100%) diff --git a/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl index 5976ac8cc7..945c6653eb 100644 --- a/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl +++ b/NDTensors/test/NDTensorsTestUtils/NDTensorsTestUtils.jl @@ -3,9 +3,8 @@ module NDTensorsTestUtils using NDTensors include("device_list.jl") -include("is_suppoted_eltype.jl") +include("is_supported_eltype.jl") default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt)))) -export default_rtol, is_supported_eltype, devices_list end diff --git a/NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl 
b/NDTensors/test/NDTensorsTestUtils/is_supported_eltype.jl similarity index 100% rename from NDTensors/test/NDTensorsTestUtils/is_suppoted_eltype.jl rename to NDTensors/test/NDTensorsTestUtils/is_supported_eltype.jl From 6530969ab88ab7c4e2946a90d0fcbab94603a82a Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 12:04:21 -0500 Subject: [PATCH 18/73] Update test files to create random modules and run in the modules. Also include only functions necessary to file --- NDTensors/src/Unwrap/test/runtests.jl | 381 +++++++++--------- .../test/{test_readwrite.jl => readwrite.jl} | 1 + NDTensors/test/runtests.jl | 53 +-- NDTensors/test/test_BlockSparseArrays.jl | 3 +- NDTensors/test/test_DiagonalArrays.jl | 3 +- NDTensors/test/test_NamedDimsArrays.jl | 3 +- NDTensors/test/test_SetParameters.jl | 3 +- NDTensors/test/test_SmallVectors.jl | 3 +- NDTensors/test/test_SortedSets.jl | 3 +- NDTensors/test/test_TagSets.jl | 3 +- NDTensors/test/test_TensorAlgebra.jl | 3 +- NDTensors/test/{Unwrap.jl => test_Unwrap.jl} | 3 +- NDTensors/test/test_blocksparse.jl | 4 +- NDTensors/test/test_combiner.jl | 13 +- NDTensors/test/test_dense.jl | 13 +- NDTensors/test/test_diag.jl | 12 +- NDTensors/test/test_diagblocksparse.jl | 6 +- NDTensors/test/test_emptynumber.jl | 5 +- NDTensors/test/test_emptystorage.jl | 12 +- NDTensors/test/test_linearalgebra.jl | 6 +- .../{tupletools.jl => test_tupletools.jl} | 6 +- 21 files changed, 260 insertions(+), 279 deletions(-) rename NDTensors/test/{test_readwrite.jl => readwrite.jl} (96%) rename NDTensors/test/{Unwrap.jl => test_Unwrap.jl} (77%) rename NDTensors/test/{tupletools.jl => test_tupletools.jl} (65%) diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index f742e150e6..f16e1edb37 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -1,195 +1,194 @@ -using Test -using NDTensors.Unwrap -using NDTensors -using LinearAlgebra -using GPUArraysCore: @allowscalar - -include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: devices_list - -@testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS), - elt in (Float32, ComplexF32) - - v = dev(randn(elt, 10)) - vt = transpose(v) - va = v' - - E = expose(v) - Et = expose(vt) - Ea = expose(va) - v_type = typeof(v) - e_type = eltype(v) - @test typeof(E) == Exposed{v_type,v_type} - @test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}} - @test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}} - - @test parent(E) == v - @test parent(Et) == v - @test parent(Ea) == v - @test transpose(E) == vt - @test cpu(E) == cpu(v) - @test cpu(Et) == cpu(vt) - - m = reshape(v, (5, 2)) - mt = transpose(m) - ma = m' - E = expose(m) - Et = expose(mt) - Ea = expose(ma) - - m_type = typeof(m) - @test typeof(E) == Exposed{m_type,m_type} - @test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}} - @test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}} - - o = dev(randn(elt, 1)) - expose(o)[] = 2 - @test expose(o)[] == 2 - - fill!(m, 0) - @test any(!Base.isinf, expose(m)) - - mp = copy(Ea) - @test mp == ma - fill!(ma, 2.0) - copyto!(expose(mp), expose(ma)) - @test mp == ma - - q, r = qr(expose(mp)) - @test q * r ≈ mp - - q, r = Unwrap.qr_positive(expose(mp)) - @test q * r ≈ mp - - square = dev(rand(real(elt), (10, 10))) - square = (square + transpose(square)) / 2 - ## CUDA only supports Hermitian or Symmetric eigen decompositions - ## So I symmetrize 
square and call symetric here - l, U = eigen(expose(Symmetric(square))) - @test eltype(l) == real(elt) - @test eltype(U) == real(elt) - @test square * U ≈ U * Diagonal(l) - - square = dev(rand(elt, (10, 10))) - # Can use `hermitianpart` in Julia 1.10 - square = (square + square') / 2 - ## CUDA only supports Hermitian or Symmetric eigen decompositions - ## So I symmetrize square and call symetric here - l, U = eigen(expose(Hermitian(square))) - @test eltype(l) == real(elt) - @test eltype(U) == elt - @test square * U ≈ U * Diagonal(l) - - U, S, V, = svd(expose(mp)) - @test eltype(U) == elt - @test eltype(S) == real(elt) - @test eltype(V) == elt - @test U * Diagonal(S) * V' ≈ mp - - cm = dev(randn(elt, 2, 2)) - mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) - @test cm ≈ mp * mp' - - @test permutedims(expose(mp), (2, 1)) == transpose(mp) - fill!(mt, 3) - permutedims!(expose(m), expose(mt), (2, 1)) - @test norm(m) ≈ sqrt(3^2 * 10) - @test size(m) == (5, 2) - permutedims!(expose(m), expose(mt), (2, 1), +) - @test size(m) == (5, 2) - @test norm(m) ≈ sqrt(6^2 * 10) - - m = reshape(m, (5, 2, 1)) - mt = fill!(similar(m), 3.0) - m = permutedims(expose(m), (2, 1, 3)) - @test size(m) == (2, 5, 1) - permutedims!(expose(m), expose(mt), (2, 1, 3)) - @test norm(m) ≈ sqrt(3^2 * 10) - permutedims!(expose(m), expose(mt), (2, 1, 3), -) - @test norm(m) == 0 - - x = dev(rand(elt, 4, 4)) - y = dev(rand(elt, 4, 4)) - copyto!(expose(y), expose(x)) - @test y == x - - y = dev(rand(elt, 4, 4)) - x = Base.ReshapedArray(dev(rand(elt, 16)), (4, 4), ()) - copyto!(expose(y), expose(x)) - @test NDTensors.cpu(y) == NDTensors.cpu(x) - @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x) - - y = dev(rand(elt, 4, 4)) - x = @view dev(rand(elt, 8, 8))[1:4, 1:4] - copyto!(expose(y), expose(x)) - @test y == x - @test copy(x) == x - - y = dev(randn(elt, 16)) - x = reshape(dev(randn(elt, 4, 4))', 16) - copyto!(expose(y), expose(x)) - @allowscalar begin +using Test: @testset, @test, @test_broken + using NDTensors.Unwrap + using NDTensors: NDTensors, mul!! + using LinearAlgebra: LinearAlgebra, Transpose, qr, Symmetric, eigen, Hermitian, Diagonal, svd, mul! 
+ using GPUArraysCore: @allowscalar + include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") + using .NDTensorsTestUtils: NDTensorsTestUtils + + @testset "Testing Unwrap $dev, $elt" for dev in NDTensorsTestUtils.devices_list(ARGS), + elt in (Float32, ComplexF32) + + v = dev(randn(elt, 10)) + vt = transpose(v) + va = v' + + E = expose(v) + Et = expose(vt) + Ea = expose(va) + v_type = typeof(v) + e_type = eltype(v) + @test typeof(E) == Exposed{v_type,v_type} + @test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}} + @test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}} + + @test parent(E) == v + @test parent(Et) == v + @test parent(Ea) == v + @test transpose(E) == vt + @test cpu(E) == cpu(v) + @test cpu(Et) == cpu(vt) + + m = reshape(v, (5, 2)) + mt = transpose(m) + ma = m' + E = expose(m) + Et = expose(mt) + Ea = expose(ma) + + m_type = typeof(m) + @test typeof(E) == Exposed{m_type,m_type} + @test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}} + @test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}} + + o = dev(randn(elt, 1)) + expose(o)[] = 2 + @test expose(o)[] == 2 + + fill!(m, 0) + @test any(!Base.isinf, expose(m)) + + mp = copy(Ea) + @test mp == ma + fill!(ma, 2.0) + copyto!(expose(mp), expose(ma)) + @test mp == ma + + q, r = qr(expose(mp)) + @test q * r ≈ mp + + q, r = Unwrap.qr_positive(expose(mp)) + @test q * r ≈ mp + + square = dev(rand(real(elt), (10, 10))) + square = (square + transpose(square)) / 2 + ## CUDA only supports Hermitian or Symmetric eigen decompositions + ## So I symmetrize square and call symetric here + l, U = eigen(expose(Symmetric(square))) + @test eltype(l) == real(elt) + @test eltype(U) == real(elt) + @test square * U ≈ U * Diagonal(l) + + square = dev(rand(elt, (10, 10))) + # Can use `hermitianpart` in Julia 1.10 + square = (square + square') / 2 + ## CUDA only supports Hermitian or Symmetric eigen decompositions + ## So I symmetrize square and call symetric here + l, U = eigen(expose(Hermitian(square))) + @test eltype(l) == real(elt) + @test eltype(U) == elt + @test square * U ≈ U * Diagonal(l) + + U, S, V, = svd(expose(mp)) + @test eltype(U) == elt + @test eltype(S) == real(elt) + @test eltype(V) == elt + @test U * Diagonal(S) * V' ≈ mp + + cm = dev(randn(elt, 2, 2)) + mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) + @test cm ≈ mp * mp' + + @test permutedims(expose(mp), (2, 1)) == transpose(mp) + fill!(mt, 3) + permutedims!(expose(m), expose(mt), (2, 1)) + @test norm(m) ≈ sqrt(3^2 * 10) + @test size(m) == (5, 2) + permutedims!(expose(m), expose(mt), (2, 1), +) + @test size(m) == (5, 2) + @test norm(m) ≈ sqrt(6^2 * 10) + + m = reshape(m, (5, 2, 1)) + mt = fill!(similar(m), 3.0) + m = permutedims(expose(m), (2, 1, 3)) + @test size(m) == (2, 5, 1) + permutedims!(expose(m), expose(mt), (2, 1, 3)) + @test norm(m) ≈ sqrt(3^2 * 10) + permutedims!(expose(m), expose(mt), (2, 1, 3), -) + @test norm(m) == 0 + + x = dev(rand(elt, 4, 4)) + y = dev(rand(elt, 4, 4)) + copyto!(expose(y), expose(x)) @test y == x - @test copy(x) == x - end - y = dev(randn(elt, 8)) - x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8] - copyto!(expose(y), expose(x)) - @allowscalar begin + y = dev(rand(elt, 4, 4)) + x = Base.ReshapedArray(dev(rand(elt, 16)), (4, 4), ()) + copyto!(expose(y), expose(x)) + @test NDTensors.cpu(y) == NDTensors.cpu(x) + @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x) + + y = dev(rand(elt, 4, 4)) + x = @view dev(rand(elt, 8, 8))[1:4, 1:4] + copyto!(expose(y), 
expose(x)) @test y == x @test copy(x) == x - end - - y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) - x = dev(randn(elt, 4, 4)) - permutedims!(expose(y), expose(x), (2, 1)) - @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) - - ########################################## - ### Testing an issue with CUDA&Metal transpose/adjoint mul - A = dev(randn(elt, (3, 2))) - B = dev(randn(elt, (3, 4))) - C = dev(randn(elt, (4, 2))) - Cp = copy(C) - - ## This fails with scalar indexing - if dev != NDTensors.cpu - @test_broken mul!(transpose(C), transpose(A), B, true, false) - end - mul!(C, transpose(B), A, true, false) - mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) - @test C ≈ Cp - Cp = zero(C) - ## Try calling mul!! with transposes to verify that code works - Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) - @test transpose(Cpt) ≈ C - - Cp = zero(C) - ## This fails with scalar indexing - if dev != NDTensors.cpu - @test_broken mul!(C', A', B, true, false) - end - mul!(C, B', A, true, false) - mul!(expose(Cp'), expose(A'), expose(B), true, false) - @test C ≈ Cp - Cp = zero(C) - Cpt = NDTensors.mul!!(Cp', A', B, true, false) - @test Cpt' ≈ C - - ################################## - ### Add test for transpose(reshape(adjoint )) failure in CUDA - - A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) - B = dev(randn(elt, 2, 2)) - C = dev(zeros(elt, 2, 12)) - NDTensors.mul!(expose(C), expose(B), expose(A), true, false) - Cp = NDTensors.cpu(similar(C)) - NDTensors.mul!( - expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false - ) - @test NDTensors.cpu(C) ≈ Cp - NDTensors.zero(C) - NDTensors.mul!!(C, B, A, true, false) - @test NDTensors.cpu(C) ≈ Cp -end + + y = dev(randn(elt, 16)) + x = reshape(dev(randn(elt, 4, 4))', 16) + copyto!(expose(y), expose(x)) + @allowscalar begin + @test y == x + @test copy(x) == x + end + + y = dev(randn(elt, 8)) + x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8] + copyto!(expose(y), expose(x)) + @allowscalar begin + @test y == x + @test copy(x) == x + end + + y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) + x = dev(randn(elt, 4, 4)) + permutedims!(expose(y), expose(x), (2, 1)) + @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) + + ########################################## + ### Testing an issue with CUDA&Metal transpose/adjoint mul + A = dev(randn(elt, (3, 2))) + B = dev(randn(elt, (3, 4))) + C = dev(randn(elt, (4, 2))) + Cp = copy(C) + + ## This fails with scalar indexing + if dev != NDTensors.cpu + @test_broken mul!(transpose(C), transpose(A), B, true, false) + end + mul!(C, transpose(B), A, true, false) + mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) + @test C ≈ Cp + Cp = zero(C) + ## Try calling mul!! 
with transposes to verify that code works + Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) + @test transpose(Cpt) ≈ C + + Cp = zero(C) + ## This fails with scalar indexing + if dev != NDTensors.cpu + @test_broken mul!(C', A', B, true, false) + end + mul!(C, B', A, true, false) + mul!(expose(Cp'), expose(A'), expose(B), true, false) + @test C ≈ Cp + Cp = zero(C) + Cpt = NDTensors.mul!!(Cp', A', B, true, false) + @test Cpt' ≈ C + + ################################## + ### Add test for transpose(reshape(adjoint )) failure in CUDA + + A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) + B = dev(randn(elt, 2, 2)) + C = dev(zeros(elt, 2, 12)) + NDTensors.mul!(expose(C), expose(B), expose(A), true, false) + Cp = NDTensors.cpu(similar(C)) + NDTensors.mul!( + expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false + ) + @test NDTensors.cpu(C) ≈ Cp + NDTensors.zero(C) + NDTensors.mul!!(C, B, A, true, false) + @test NDTensors.cpu(C) ≈ Cp + end \ No newline at end of file diff --git a/NDTensors/test/test_readwrite.jl b/NDTensors/test/readwrite.jl similarity index 96% rename from NDTensors/test/test_readwrite.jl rename to NDTensors/test/readwrite.jl index 4122a65b52..a79910eb14 100644 --- a/NDTensors/test/test_readwrite.jl +++ b/NDTensors/test/readwrite.jl @@ -1,3 +1,4 @@ +## TODO this file doesn't seem to work properly using NDTensors, Test using HDF5 diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl index 8e1cb7217f..deff1f516d 100644 --- a/NDTensors/test/runtests.jl +++ b/NDTensors/test/runtests.jl @@ -1,48 +1,15 @@ -using Test -using SafeTestsets - -include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: default_rtol, devices_list - -println("Passing arguments ARGS=$(ARGS) to test.") - -if isempty(ARGS) || "base" in ARGS - println( - """\nArguments ARGS = $(ARGS) are empty, or contain `"base"`. Running cpu NDTensors tests.""", - ) -end -if "cuda" in ARGS || "all" in ARGS - println("""\nArguments ARGS = $(ARGS) contain `"cuda"`. Running NDTensorCUDA tests.""") - using CUDA -end -if "metal" in ARGS || "all" in ARGS - println("""\nArguments ARGS = $(ARGS) contain`"metal"`. 
Running NDTensorMetal tests.""") - using Metal -end +using SafeTestsets: @safetestset @safetestset "NDTensors" begin - @testset "$filename" for filename in [ - "BlockSparseArrays.jl", - "DiagonalArrays.jl", - "SetParameters.jl", - "SmallVectors.jl", - "SortedSets.jl", - "TagSets.jl", - "TensorAlgebra.jl", - "Unwrap.jl", - "linearalgebra.jl", - "dense.jl", - "blocksparse.jl", - "diagblocksparse.jl", - "diag.jl", - "emptynumber.jl", - "emptystorage.jl", - "combiner.jl", - "arraytensor/arraytensor.jl", - "ITensors/runtests.jl", - ] - println("Running $filename") - include(filename) + using Test: @testset + @testset "$(@__DIR__)" begin + filenames = filter(readdir(@__DIR__)) do f + startswith("test_")(f) && endswith(".jl")(f) + end + @testset "Test $(@__DIR__)/$filename" for filename in filenames + println("Running $(@__DIR__)/$filename") + include(filename) + end end if "cuda" in ARGS || "all" in ARGS include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorCUDA.jl")) diff --git a/NDTensors/test/test_BlockSparseArrays.jl b/NDTensors/test/test_BlockSparseArrays.jl index 5d1345d0e7..c372133850 100644 --- a/NDTensors/test/test_BlockSparseArrays.jl +++ b/NDTensors/test/test_BlockSparseArrays.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "BlockSparseArrays", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_DiagonalArrays.jl b/NDTensors/test/test_DiagonalArrays.jl index fa9b5a0c2b..332b886757 100644 --- a/NDTensors/test/test_DiagonalArrays.jl +++ b/NDTensors/test/test_DiagonalArrays.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "DiagonalArrays", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_NamedDimsArrays.jl b/NDTensors/test/test_NamedDimsArrays.jl index ce0ed1da77..229ca5138a 100644 --- a/NDTensors/test/test_NamedDimsArrays.jl +++ b/NDTensors/test/test_NamedDimsArrays.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "NamedDimsArrays", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_SetParameters.jl b/NDTensors/test/test_SetParameters.jl index a8026805fe..46ea9007e2 100644 --- a/NDTensors/test/test_SetParameters.jl +++ b/NDTensors/test/test_SetParameters.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "SetParameters", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_SmallVectors.jl b/NDTensors/test/test_SmallVectors.jl index 62b552dc72..745ac80d25 100644 --- a/NDTensors/test/test_SmallVectors.jl +++ b/NDTensors/test/test_SmallVectors.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "SmallVectors", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_SortedSets.jl b/NDTensors/test/test_SortedSets.jl index e5a885737d..a3008de963 100644 --- a/NDTensors/test/test_SortedSets.jl +++ b/NDTensors/test/test_SortedSets.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "SortedSets", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_TagSets.jl b/NDTensors/test/test_TagSets.jl index 3ce0fbfd98..e9f81c13d1 100644 --- a/NDTensors/test/test_TagSets.jl +++ 
b/NDTensors/test/test_TagSets.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "TagSets", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_TensorAlgebra.jl b/NDTensors/test/test_TensorAlgebra.jl index 4cd51cf4cb..d916c75f2a 100644 --- a/NDTensors/test/test_TensorAlgebra.jl +++ b/NDTensors/test/test_TensorAlgebra.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "TensorAlgebra", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/Unwrap.jl b/NDTensors/test/test_Unwrap.jl similarity index 77% rename from NDTensors/test/Unwrap.jl rename to NDTensors/test/test_Unwrap.jl index a7a49a2a17..ebb82ff242 100644 --- a/NDTensors/test/Unwrap.jl +++ b/NDTensors/test/test_Unwrap.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) using NDTensors include(joinpath(pkgdir(NDTensors), "src", "Unwrap", "test", "runtests.jl")) +end \ No newline at end of file diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 95fb44dccb..3c1ecebb34 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -1,6 +1,4 @@ -using SafeTestsets: @safetestset - -@safetestset "test_blocksparse" begin +@eval module $(gensym()) using NDTensors using LinearAlgebra: exp, Hermitian, svd using Test: @testset, @test, @test_throws diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index 9c0f21fb75..f59333d94f 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -1,17 +1,15 @@ +@eval module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar -## TODO headergaurd -# include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -# using .NDTensorsTestUtils: default_rtol, devices_list +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: NDTensorsTestUtils # Testing generic block indices using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) @testset "Dense * Combiner" begin d = 2 input_tensor_inds = (d, d, d) @@ -88,3 +86,4 @@ using ITensors: QN, Index end end end +end \ No newline at end of file diff --git a/NDTensors/test/test_dense.jl b/NDTensors/test/test_dense.jl index 73fbae6a2c..105d75d03a 100644 --- a/NDTensors/test/test_dense.jl +++ b/NDTensors/test/test_dense.jl @@ -1,13 +1,13 @@ +@eval module $(gensym()) using NDTensors -using Test +using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar -## TODO headergaurd -#include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -#using .NDTensorsTestUtils: default_rtol, devices_list +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: NDTensorsTestUtils @testset "Dense Tensors" begin - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + elt = dev == NDTensors.mtl ? 
Float32 : Float64 # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin A = dev(Tensor(elt, (3, 4))) @@ -276,3 +276,4 @@ using GPUArraysCore: @allowscalar end nothing +end \ No newline at end of file diff --git a/NDTensors/test/test_diag.jl b/NDTensors/test/test_diag.jl index 62540485d1..849822e364 100644 --- a/NDTensors/test/test_diag.jl +++ b/NDTensors/test/test_diag.jl @@ -1,13 +1,12 @@ +@eval module $(gensym()) using NDTensors -using Test +using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar -## TODO headergaurd -# include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -# using .NDTensorsTestUtils: default_rtol, devices_list +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: NDTensorsTestUtils @testset "DiagTensor basic functionality" begin - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs, + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in (Float32, ComplexF32, Float64, ComplexF64) if dev == NDTensors.mtl && real(elt) ≠ Float32 @@ -66,3 +65,4 @@ end @test contract(A, (-2, 1), t, (-2, 3)) == transpose(A) end nothing +end \ No newline at end of file diff --git a/NDTensors/test/test_diagblocksparse.jl b/NDTensors/test/test_diagblocksparse.jl index 94cc3e6ff4..070641b6c1 100644 --- a/NDTensors/test/test_diagblocksparse.jl +++ b/NDTensors/test/test_diagblocksparse.jl @@ -1,6 +1,7 @@ -using Dictionaries +@eval module $(gensym()) +using Dictionaries: Dictionary using NDTensors -using Test +using Test: @testset, @test @testset "UniformDiagBlockSparseTensor basic functionality" begin NeverAlias = NDTensors.NeverAlias @@ -24,3 +25,4 @@ using Test @test conj(NeverAlias(), tensor)[1, 1] == conj(c) @test conj(AllowAlias(), tensor)[1, 1] == conj(c) end +end \ No newline at end of file diff --git a/NDTensors/test/test_emptynumber.jl b/NDTensors/test/test_emptynumber.jl index 34cdb8e2ce..a1806e6a39 100644 --- a/NDTensors/test/test_emptynumber.jl +++ b/NDTensors/test/test_emptynumber.jl @@ -1,6 +1,6 @@ +@eval module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using Test: @testset, @test, @test_throws const 𝟎 = NDTensors.EmptyNumber() @@ -29,3 +29,4 @@ const 𝟎 = NDTensors.EmptyNumber() @test norm(𝟎) == 0.0 @test norm(𝟎) isa Float64 end +end \ No newline at end of file diff --git a/NDTensors/test/test_emptystorage.jl b/NDTensors/test/test_emptystorage.jl index 7c8651cb62..272f203632 100644 --- a/NDTensors/test/test_emptystorage.jl +++ b/NDTensors/test/test_emptystorage.jl @@ -1,12 +1,11 @@ +@eval module $(gensym()) using NDTensors -using Test -## TODO headergaurd -# include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -# using .NDTensorsTestUtils: devices_list +using Test: @testset, @test +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: NDTensorsTestUtils @testset "EmptyStorage test" begin - devs = devices_list(copy(ARGS)) - @testset "test device: $dev" for dev in devs + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) T = dev(Tensor(EmptyStorage(NDTensors.EmptyNumber), (2, 2))) @test size(T) == (2, 2) @test eltype(T) == NDTensors.EmptyNumber @@ -33,3 +32,4 @@ using Test @test zero(T) isa typeof(T) end end +end \ No newline at end of file diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl index 051d7c03c5..94b64400f4 100644 --- a/NDTensors/test/test_linearalgebra.jl +++ b/NDTensors/test/test_linearalgebra.jl @@ -1,6 +1,7 @@ +@eval 
module $(gensym()) using NDTensors -using LinearAlgebra -using Test +using LinearAlgebra: Diagonal, qr, diag +using Test: @testset, @test using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: default_rtol, devices_list @@ -88,3 +89,4 @@ devs = devices_list(copy(ARGS)) end nothing +end \ No newline at end of file diff --git a/NDTensors/test/tupletools.jl b/NDTensors/test/test_tupletools.jl similarity index 65% rename from NDTensors/test/tupletools.jl rename to NDTensors/test/test_tupletools.jl index f8e0ec1ffa..3e9547f757 100644 --- a/NDTensors/test/tupletools.jl +++ b/NDTensors/test/test_tupletools.jl @@ -1,5 +1,6 @@ -using Test -using NDTensors +@eval module $(gensym()) +using Test: @testset, @test +using NDTensors: NDTensors @testset "Test non-exported tuple tools" begin @test NDTensors.diff((1, 3, 6, 4)) == (2, 3, -2) @@ -7,3 +8,4 @@ using NDTensors end nothing +end \ No newline at end of file From a792341c36e25fb39e39ae10d423ad7984fcb4df Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 12:05:00 -0500 Subject: [PATCH 19/73] format --- NDTensors/src/Unwrap/test/runtests.jl | 379 ++++++++--------- NDTensors/test/test_BlockSparseArrays.jl | 2 +- NDTensors/test/test_DiagonalArrays.jl | 2 +- NDTensors/test/test_NamedDimsArrays.jl | 2 +- NDTensors/test/test_SetParameters.jl | 2 +- NDTensors/test/test_SmallVectors.jl | 2 +- NDTensors/test/test_SortedSets.jl | 2 +- NDTensors/test/test_TagSets.jl | 2 +- NDTensors/test/test_TensorAlgebra.jl | 2 +- NDTensors/test/test_Unwrap.jl | 2 +- NDTensors/test/test_blocksparse.jl | 518 ++++++++++++----------- NDTensors/test/test_combiner.jl | 2 +- NDTensors/test/test_dense.jl | 2 +- NDTensors/test/test_diag.jl | 2 +- NDTensors/test/test_diagblocksparse.jl | 2 +- NDTensors/test/test_emptynumber.jl | 2 +- NDTensors/test/test_emptystorage.jl | 2 +- NDTensors/test/test_linearalgebra.jl | 2 +- NDTensors/test/test_tupletools.jl | 2 +- 19 files changed, 469 insertions(+), 462 deletions(-) diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index f16e1edb37..6f2e3bc0fd 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -1,194 +1,195 @@ using Test: @testset, @test, @test_broken - using NDTensors.Unwrap - using NDTensors: NDTensors, mul!! - using LinearAlgebra: LinearAlgebra, Transpose, qr, Symmetric, eigen, Hermitian, Diagonal, svd, mul! 
- using GPUArraysCore: @allowscalar - include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") - using .NDTensorsTestUtils: NDTensorsTestUtils - - @testset "Testing Unwrap $dev, $elt" for dev in NDTensorsTestUtils.devices_list(ARGS), - elt in (Float32, ComplexF32) - - v = dev(randn(elt, 10)) - vt = transpose(v) - va = v' - - E = expose(v) - Et = expose(vt) - Ea = expose(va) - v_type = typeof(v) - e_type = eltype(v) - @test typeof(E) == Exposed{v_type,v_type} - @test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}} - @test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}} - - @test parent(E) == v - @test parent(Et) == v - @test parent(Ea) == v - @test transpose(E) == vt - @test cpu(E) == cpu(v) - @test cpu(Et) == cpu(vt) - - m = reshape(v, (5, 2)) - mt = transpose(m) - ma = m' - E = expose(m) - Et = expose(mt) - Ea = expose(ma) - - m_type = typeof(m) - @test typeof(E) == Exposed{m_type,m_type} - @test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}} - @test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}} - - o = dev(randn(elt, 1)) - expose(o)[] = 2 - @test expose(o)[] == 2 - - fill!(m, 0) - @test any(!Base.isinf, expose(m)) - - mp = copy(Ea) - @test mp == ma - fill!(ma, 2.0) - copyto!(expose(mp), expose(ma)) - @test mp == ma - - q, r = qr(expose(mp)) - @test q * r ≈ mp - - q, r = Unwrap.qr_positive(expose(mp)) - @test q * r ≈ mp - - square = dev(rand(real(elt), (10, 10))) - square = (square + transpose(square)) / 2 - ## CUDA only supports Hermitian or Symmetric eigen decompositions - ## So I symmetrize square and call symetric here - l, U = eigen(expose(Symmetric(square))) - @test eltype(l) == real(elt) - @test eltype(U) == real(elt) - @test square * U ≈ U * Diagonal(l) - - square = dev(rand(elt, (10, 10))) - # Can use `hermitianpart` in Julia 1.10 - square = (square + square') / 2 - ## CUDA only supports Hermitian or Symmetric eigen decompositions - ## So I symmetrize square and call symetric here - l, U = eigen(expose(Hermitian(square))) - @test eltype(l) == real(elt) - @test eltype(U) == elt - @test square * U ≈ U * Diagonal(l) - - U, S, V, = svd(expose(mp)) - @test eltype(U) == elt - @test eltype(S) == real(elt) - @test eltype(V) == elt - @test U * Diagonal(S) * V' ≈ mp - - cm = dev(randn(elt, 2, 2)) - mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) - @test cm ≈ mp * mp' - - @test permutedims(expose(mp), (2, 1)) == transpose(mp) - fill!(mt, 3) - permutedims!(expose(m), expose(mt), (2, 1)) - @test norm(m) ≈ sqrt(3^2 * 10) - @test size(m) == (5, 2) - permutedims!(expose(m), expose(mt), (2, 1), +) - @test size(m) == (5, 2) - @test norm(m) ≈ sqrt(6^2 * 10) - - m = reshape(m, (5, 2, 1)) - mt = fill!(similar(m), 3.0) - m = permutedims(expose(m), (2, 1, 3)) - @test size(m) == (2, 5, 1) - permutedims!(expose(m), expose(mt), (2, 1, 3)) - @test norm(m) ≈ sqrt(3^2 * 10) - permutedims!(expose(m), expose(mt), (2, 1, 3), -) - @test norm(m) == 0 - - x = dev(rand(elt, 4, 4)) - y = dev(rand(elt, 4, 4)) - copyto!(expose(y), expose(x)) +using NDTensors.Unwrap +using NDTensors: NDTensors, mul!! +using LinearAlgebra: + LinearAlgebra, Transpose, qr, Symmetric, eigen, Hermitian, Diagonal, svd, mul! 
+using GPUArraysCore: @allowscalar
+include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl")
+using .NDTensorsTestUtils: NDTensorsTestUtils
+
+@testset "Testing Unwrap $dev, $elt" for dev in NDTensorsTestUtils.devices_list(ARGS),
+  elt in (Float32, ComplexF32)
+
+  v = dev(randn(elt, 10))
+  vt = transpose(v)
+  va = v'
+
+  E = expose(v)
+  Et = expose(vt)
+  Ea = expose(va)
+  v_type = typeof(v)
+  e_type = eltype(v)
+  @test typeof(E) == Exposed{v_type,v_type}
+  @test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}}
+  @test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}}
+
+  @test parent(E) == v
+  @test parent(Et) == v
+  @test parent(Ea) == v
+  @test transpose(E) == vt
+  @test cpu(E) == cpu(v)
+  @test cpu(Et) == cpu(vt)
+
+  m = reshape(v, (5, 2))
+  mt = transpose(m)
+  ma = m'
+  E = expose(m)
+  Et = expose(mt)
+  Ea = expose(ma)
+
+  m_type = typeof(m)
+  @test typeof(E) == Exposed{m_type,m_type}
+  @test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}}
+  @test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}}
+
+  o = dev(randn(elt, 1))
+  expose(o)[] = 2
+  @test expose(o)[] == 2
+
+  fill!(m, 0)
+  @test any(!Base.isinf, expose(m))
+
+  mp = copy(Ea)
+  @test mp == ma
+  fill!(ma, 2.0)
+  copyto!(expose(mp), expose(ma))
+  @test mp == ma
+
+  q, r = qr(expose(mp))
+  @test q * r ≈ mp
+
+  q, r = Unwrap.qr_positive(expose(mp))
+  @test q * r ≈ mp
+
+  square = dev(rand(real(elt), (10, 10)))
+  square = (square + transpose(square)) / 2
+  ## CUDA only supports Hermitian or Symmetric eigen decompositions
+  ## So I symmetrize square and call Symmetric here
+  l, U = eigen(expose(Symmetric(square)))
+  @test eltype(l) == real(elt)
+  @test eltype(U) == real(elt)
+  @test square * U ≈ U * Diagonal(l)
+
+  square = dev(rand(elt, (10, 10)))
+  # Can use `hermitianpart` in Julia 1.10
+  square = (square + square') / 2
+  ## CUDA only supports Hermitian or Symmetric eigen decompositions
+  ## So I symmetrize square and call Hermitian here
+  l, U = eigen(expose(Hermitian(square)))
+  @test eltype(l) == real(elt)
+  @test eltype(U) == elt
+  @test square * U ≈ U * Diagonal(l)
+
+  U, S, V, = svd(expose(mp))
+  @test eltype(U) == elt
+  @test eltype(S) == real(elt)
+  @test eltype(V) == elt
+  @test U * Diagonal(S) * V' ≈ mp
+
+  cm = dev(randn(elt, 2, 2))
+  mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0)
+  @test cm ≈ mp * mp'
+
+  @test permutedims(expose(mp), (2, 1)) == transpose(mp)
+  fill!(mt, 3)
+  permutedims!(expose(m), expose(mt), (2, 1))
+  @test norm(m) ≈ sqrt(3^2 * 10)
+  @test size(m) == (5, 2)
+  permutedims!(expose(m), expose(mt), (2, 1), +)
+  @test size(m) == (5, 2)
+  @test norm(m) ≈ sqrt(6^2 * 10)
+
+  m = reshape(m, (5, 2, 1))
+  mt = fill!(similar(m), 3.0)
+  m = permutedims(expose(m), (2, 1, 3))
+  @test size(m) == (2, 5, 1)
+  permutedims!(expose(m), expose(mt), (2, 1, 3))
+  @test norm(m) ≈ sqrt(3^2 * 10)
+  permutedims!(expose(m), expose(mt), (2, 1, 3), -)
+  @test norm(m) == 0
+
+  x = dev(rand(elt, 4, 4))
+  y = dev(rand(elt, 4, 4))
+  copyto!(expose(y), expose(x))
+  @test y == x
+
+  y = dev(rand(elt, 4, 4))
+  x = Base.ReshapedArray(dev(rand(elt, 16)), (4, 4), ())
+  copyto!(expose(y), expose(x))
+  @test NDTensors.cpu(y) == NDTensors.cpu(x)
+  @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x)
+
+  y = dev(rand(elt, 4, 4))
+  x = @view dev(rand(elt, 8, 8))[1:4, 1:4]
+  copyto!(expose(y), expose(x))
+  @test y == x
+  @test copy(x) == x
+
+  y = dev(randn(elt, 16))
+  x = reshape(dev(randn(elt, 4, 4))', 16)
+  copyto!(expose(y), expose(x))
+  @allowscalar 
begin @test y == x + @test copy(x) == x + end - y = dev(rand(elt, 4, 4)) - x = Base.ReshapedArray(dev(rand(elt, 16)), (4, 4), ()) - copyto!(expose(y), expose(x)) - @test NDTensors.cpu(y) == NDTensors.cpu(x) - @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x) - - y = dev(rand(elt, 4, 4)) - x = @view dev(rand(elt, 8, 8))[1:4, 1:4] - copyto!(expose(y), expose(x)) + y = dev(randn(elt, 8)) + x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8] + copyto!(expose(y), expose(x)) + @allowscalar begin @test y == x @test copy(x) == x - - y = dev(randn(elt, 16)) - x = reshape(dev(randn(elt, 4, 4))', 16) - copyto!(expose(y), expose(x)) - @allowscalar begin - @test y == x - @test copy(x) == x - end - - y = dev(randn(elt, 8)) - x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8] - copyto!(expose(y), expose(x)) - @allowscalar begin - @test y == x - @test copy(x) == x - end - - y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) - x = dev(randn(elt, 4, 4)) - permutedims!(expose(y), expose(x), (2, 1)) - @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) - - ########################################## - ### Testing an issue with CUDA&Metal transpose/adjoint mul - A = dev(randn(elt, (3, 2))) - B = dev(randn(elt, (3, 4))) - C = dev(randn(elt, (4, 2))) - Cp = copy(C) - - ## This fails with scalar indexing - if dev != NDTensors.cpu - @test_broken mul!(transpose(C), transpose(A), B, true, false) - end - mul!(C, transpose(B), A, true, false) - mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) - @test C ≈ Cp - Cp = zero(C) - ## Try calling mul!! with transposes to verify that code works - Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) - @test transpose(Cpt) ≈ C - - Cp = zero(C) - ## This fails with scalar indexing - if dev != NDTensors.cpu - @test_broken mul!(C', A', B, true, false) - end - mul!(C, B', A, true, false) - mul!(expose(Cp'), expose(A'), expose(B), true, false) - @test C ≈ Cp - Cp = zero(C) - Cpt = NDTensors.mul!!(Cp', A', B, true, false) - @test Cpt' ≈ C - - ################################## - ### Add test for transpose(reshape(adjoint )) failure in CUDA - - A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) - B = dev(randn(elt, 2, 2)) - C = dev(zeros(elt, 2, 12)) - NDTensors.mul!(expose(C), expose(B), expose(A), true, false) - Cp = NDTensors.cpu(similar(C)) - NDTensors.mul!( - expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false - ) - @test NDTensors.cpu(C) ≈ Cp - NDTensors.zero(C) - NDTensors.mul!!(C, B, A, true, false) - @test NDTensors.cpu(C) ≈ Cp - end \ No newline at end of file + end + + y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) + x = dev(randn(elt, 4, 4)) + permutedims!(expose(y), expose(x), (2, 1)) + @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) + + ########################################## + ### Testing an issue with CUDA&Metal transpose/adjoint mul + A = dev(randn(elt, (3, 2))) + B = dev(randn(elt, (3, 4))) + C = dev(randn(elt, (4, 2))) + Cp = copy(C) + + ## This fails with scalar indexing + if dev != NDTensors.cpu + @test_broken mul!(transpose(C), transpose(A), B, true, false) + end + mul!(C, transpose(B), A, true, false) + mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) + @test C ≈ Cp + Cp = zero(C) + ## Try calling mul!! 
with transposes to verify that code works + Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) + @test transpose(Cpt) ≈ C + + Cp = zero(C) + ## This fails with scalar indexing + if dev != NDTensors.cpu + @test_broken mul!(C', A', B, true, false) + end + mul!(C, B', A, true, false) + mul!(expose(Cp'), expose(A'), expose(B), true, false) + @test C ≈ Cp + Cp = zero(C) + Cpt = NDTensors.mul!!(Cp', A', B, true, false) + @test Cpt' ≈ C + + ################################## + ### Add test for transpose(reshape(adjoint )) failure in CUDA + + A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) + B = dev(randn(elt, 2, 2)) + C = dev(zeros(elt, 2, 12)) + NDTensors.mul!(expose(C), expose(B), expose(A), true, false) + Cp = NDTensors.cpu(similar(C)) + NDTensors.mul!( + expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false + ) + @test NDTensors.cpu(C) ≈ Cp + NDTensors.zero(C) + NDTensors.mul!!(C, B, A, true, false) + @test NDTensors.cpu(C) ≈ Cp +end diff --git a/NDTensors/test/test_BlockSparseArrays.jl b/NDTensors/test/test_BlockSparseArrays.jl index c372133850..67683ef1cc 100644 --- a/NDTensors/test/test_BlockSparseArrays.jl +++ b/NDTensors/test/test_BlockSparseArrays.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "BlockSparseArrays", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_DiagonalArrays.jl b/NDTensors/test/test_DiagonalArrays.jl index 332b886757..60e85f778f 100644 --- a/NDTensors/test/test_DiagonalArrays.jl +++ b/NDTensors/test/test_DiagonalArrays.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "DiagonalArrays", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_NamedDimsArrays.jl b/NDTensors/test/test_NamedDimsArrays.jl index 229ca5138a..601e12ceb5 100644 --- a/NDTensors/test/test_NamedDimsArrays.jl +++ b/NDTensors/test/test_NamedDimsArrays.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "NamedDimsArrays", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_SetParameters.jl b/NDTensors/test/test_SetParameters.jl index 46ea9007e2..097b69f2e0 100644 --- a/NDTensors/test/test_SetParameters.jl +++ b/NDTensors/test/test_SetParameters.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "SetParameters", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_SmallVectors.jl b/NDTensors/test/test_SmallVectors.jl index 745ac80d25..eb7d567493 100644 --- a/NDTensors/test/test_SmallVectors.jl +++ b/NDTensors/test/test_SmallVectors.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "SmallVectors", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_SortedSets.jl b/NDTensors/test/test_SortedSets.jl index a3008de963..b7d2b2b37b 100644 --- a/NDTensors/test/test_SortedSets.jl +++ b/NDTensors/test/test_SortedSets.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "SortedSets", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_TagSets.jl b/NDTensors/test/test_TagSets.jl index e9f81c13d1..fdb1ab48e6 100644 --- a/NDTensors/test/test_TagSets.jl +++ b/NDTensors/test/test_TagSets.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "TagSets", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git 
a/NDTensors/test/test_TensorAlgebra.jl b/NDTensors/test/test_TensorAlgebra.jl index d916c75f2a..f0f4df2bd2 100644 --- a/NDTensors/test/test_TensorAlgebra.jl +++ b/NDTensors/test/test_TensorAlgebra.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "TensorAlgebra", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_Unwrap.jl b/NDTensors/test/test_Unwrap.jl index ebb82ff242..cbade1b7b3 100644 --- a/NDTensors/test/test_Unwrap.jl +++ b/NDTensors/test/test_Unwrap.jl @@ -2,4 +2,4 @@ using NDTensors include(joinpath(pkgdir(NDTensors), "src", "Unwrap", "test", "runtests.jl")) -end \ No newline at end of file +end diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 3c1ecebb34..3cbbc5bf67 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -1,299 +1,305 @@ @eval module $(gensym()) - using NDTensors - using LinearAlgebra: exp, Hermitian, svd - using Test: @testset, @test, @test_throws - using GPUArraysCore: @allowscalar - include("NDTensorsTestUtils/NDTensorsTestUtils.jl") - using .NDTensorsTestUtils: NDTensorsTestUtils - - @testset "BlockSparseTensor basic functionality" begin - C = nothing - - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float32, Float64] - if dev == NDTensors.mtl && elt == Float64 - continue; - end - # Indices - indsA = ([2, 3], [4, 5]) - - # Locations of non-zero blocks - locs = [(1, 2), (2, 1)] - - A = dev(BlockSparseTensor{elt}(locs, indsA...)) - randn!(A) +using NDTensors +using LinearAlgebra: exp, Hermitian, svd +using Test: @testset, @test, @test_throws +using GPUArraysCore: @allowscalar +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: NDTensorsTestUtils - @test blockdims(A, (1, 2)) == (2, 5) - @test blockdims(A, (2, 1)) == (3, 4) - @test nnzblocks(A) == 2 - @test nnz(A) == 2 * 5 + 3 * 4 - @test inds(A) == ([2, 3], [4, 5]) - @test isblocknz(A, (2, 1)) - @test isblocknz(A, (1, 2)) - @test !isblocknz(A, (1, 1)) - @test !isblocknz(A, (2, 2)) - - # Test different ways of getting nnz - @test nnz(blockoffsets(A), inds(A)) == nnz(A) - - B = 2 * A - @test B[1, 1] == 2 * A[1, 1] - @test nnz(A) == 2 * 5 + 3 * 4 - @test nnz(B) == 2 * 5 + 3 * 4 - @test nnzblocks(A) == 2 - @test nnzblocks(B) == 2 - - B = A / 2 - @test B[1, 1] == A[1, 1] / 2 - @test nnz(A) == 2 * 5 + 3 * 4 - @test nnz(B) == 2 * 5 + 3 * 4 - @test nnzblocks(A) == 2 - @test nnzblocks(B) == 2 - - @allowscalar begin - A[1, 5] = 15 - A[2, 5] = 25 - - @test A[1, 1] == 0 - @test A[1, 5] == 15 - @test A[2, 5] == 25 - end - D = dense(A) - - @allowscalar begin - @test D == A +@testset "BlockSparseTensor basic functionality" begin + C = nothing - for I in eachindex(A) - @test D[I] == A[I] - end - end + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + elt in [Float32, Float64] - A12 = blockview(A, (1, 2)) + if dev == NDTensors.mtl && elt == Float64 + continue + end + # Indices + indsA = ([2, 3], [4, 5]) + + # Locations of non-zero blocks + locs = [(1, 2), (2, 1)] + + A = dev(BlockSparseTensor{elt}(locs, indsA...)) + randn!(A) + + @test blockdims(A, (1, 2)) == (2, 5) + @test blockdims(A, (2, 1)) == (3, 4) + @test nnzblocks(A) == 2 + @test nnz(A) == 2 * 5 + 3 * 4 + @test inds(A) == ([2, 3], [4, 5]) + @test isblocknz(A, (2, 1)) + @test isblocknz(A, (1, 2)) + @test !isblocknz(A, (1, 1)) + @test !isblocknz(A, (2, 2)) + + # Test different ways of getting nnz + @test 
nnz(blockoffsets(A), inds(A)) == nnz(A) + + B = 2 * A + @test B[1, 1] == 2 * A[1, 1] + @test nnz(A) == 2 * 5 + 3 * 4 + @test nnz(B) == 2 * 5 + 3 * 4 + @test nnzblocks(A) == 2 + @test nnzblocks(B) == 2 + + B = A / 2 + @test B[1, 1] == A[1, 1] / 2 + @test nnz(A) == 2 * 5 + 3 * 4 + @test nnz(B) == 2 * 5 + 3 * 4 + @test nnzblocks(A) == 2 + @test nnzblocks(B) == 2 + + @allowscalar begin + A[1, 5] = 15 + A[2, 5] = 25 + + @test A[1, 1] == 0 + @test A[1, 5] == 15 + @test A[2, 5] == 25 + end + D = dense(A) - @test dims(A12) == (2, 5) + @allowscalar begin + @test D == A - @allowscalar for I in eachindex(A12) - @test A12[I] == A[I + CartesianIndex(0, 4)] + for I in eachindex(A) + @test D[I] == A[I] end + end - B = dev(BlockSparseTensor(undef, locs, indsA)) - randn!(B) + A12 = blockview(A, (1, 2)) - C = A + B + @test dims(A12) == (2, 5) - @allowscalar for I in eachindex(C) - @test C[I] == A[I] + B[I] - end + @allowscalar for I in eachindex(A12) + @test A12[I] == A[I + CartesianIndex(0, 4)] + end - Ap = permutedims(A, (2, 1)) + B = dev(BlockSparseTensor(undef, locs, indsA)) + randn!(B) - @test blockdims(Ap, (1, 2)) == (4, 3) - @test blockdims(Ap, (2, 1)) == (5, 2) - @test nnz(A) == nnz(Ap) - @test nnzblocks(A) == nnzblocks(Ap) + C = A + B - @allowscalar for I in eachindex(C) - @test A[I] == Ap[NDTensors.permute(I, (2, 1))] - end + @allowscalar for I in eachindex(C) + @test C[I] == A[I] + B[I] + end - A = dev(BlockSparseTensor(complex(elt), locs, indsA)) - randn!(A) - @test conj(data(store(A))) == data(store(conj(A))) - @test typeof(conj(A)) <: BlockSparseTensor - - @testset "Random constructor" for elt in [Float32, Float64] - T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(T) == 2 - @test nnz(T) == 8 - @test eltype(T) == elt - @test norm(T) ≉ 0 - - Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(Tc) == 2 - @test nnz(Tc) == 8 - @test eltype(Tc) == complex(elt) - @test norm(Tc) ≉ 0 - end + Ap = permutedims(A, (2, 1)) - @testset "Complex Valued Operations" for elt in [Float32, Float64] - T = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - rT = real(T) - @test eltype(rT) == elt - @test nnzblocks(rT) == nnzblocks(T) - iT = imag(T) - @test eltype(iT) == elt - @test nnzblocks(iT) == nnzblocks(T) - @test norm(rT)^2 + norm(iT)^2 ≈ norm(T)^2 - - cT = conj(T) - @test eltype(cT) == complex(elt) - @test nnzblocks(cT) == nnzblocks(T) - end + @test blockdims(Ap, (1, 2)) == (4, 3) + @test blockdims(Ap, (2, 1)) == (5, 2) + @test nnz(A) == nnz(Ap) + @test nnzblocks(A) == nnzblocks(Ap) - @testset "similartype regression test" for elt in [Float32, Float64] - # Regression test for issue seen in: - # https://github.com/ITensor/ITensorInfiniteMPS.jl/pull/77 - # Previously, `similartype` wasn't using information about the dimensions - # properly and was returning a `BlockSparse` storage of the dimensions - # of the input tensor. 
- T = dev(BlockSparseTensor(elt, [(1, 1)], ([2], [2]))) - @test NDTensors.ndims( - NDTensors.storagetype(NDTensors.similartype(typeof(T), ([2], [2], [2]))) - ) == 3 - end + @allowscalar for I in eachindex(C) + @test A[I] == Ap[NDTensors.permute(I, (2, 1))] + end - @testset "Random constructor" for elt in [Float32, Float64] - T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(T) == 2 - @test nnz(T) == 8 - @test eltype(T) == elt - @test norm(T) ≉ 0 - - Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) - @test nnzblocks(Tc) == 2 - @test nnz(Tc) == 8 - @test eltype(Tc) == complex(elt) - @test norm(Tc) ≉ 0 - end + A = dev(BlockSparseTensor(complex(elt), locs, indsA)) + randn!(A) + @test conj(data(store(A))) == data(store(conj(A))) + @test typeof(conj(A)) <: BlockSparseTensor - @testset "permute_combine" for elt in [Float32, Float64] - indsA = ([2, 3], [4, 5], [6, 7, 8]) - locsA = [(2, 1, 1), (1, 2, 1), (2, 2, 3)] - A = dev(BlockSparseTensor{elt}(locsA, indsA...)) - randn!(A) + @testset "Random constructor" for elt in [Float32, Float64] + T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(T) == 2 + @test nnz(T) == 8 + @test eltype(T) == elt + @test norm(T) ≉ 0 + + Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(Tc) == 2 + @test nnz(Tc) == 8 + @test eltype(Tc) == complex(elt) + @test norm(Tc) ≉ 0 + end - B = NDTensors.permute_combine(A, 3, (2, 1)) - @test nnzblocks(A) == nnzblocks(B) - @test nnz(A) == nnz(B) + @testset "Complex Valued Operations" for elt in [Float32, Float64] + T = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + rT = real(T) + @test eltype(rT) == elt + @test nnzblocks(rT) == nnzblocks(T) + iT = imag(T) + @test eltype(iT) == elt + @test nnzblocks(iT) == nnzblocks(T) + @test norm(rT)^2 + norm(iT)^2 ≈ norm(T)^2 + + cT = conj(T) + @test eltype(cT) == complex(elt) + @test nnzblocks(cT) == nnzblocks(T) + end - Ap = NDTensors.permutedims(A, (3, 2, 1)) + @testset "similartype regression test" for elt in [Float32, Float64] + # Regression test for issue seen in: + # https://github.com/ITensor/ITensorInfiniteMPS.jl/pull/77 + # Previously, `similartype` wasn't using information about the dimensions + # properly and was returning a `BlockSparse` storage of the dimensions + # of the input tensor. + T = dev(BlockSparseTensor(elt, [(1, 1)], ([2], [2]))) + @test NDTensors.ndims( + NDTensors.storagetype(NDTensors.similartype(typeof(T), ([2], [2], [2]))) + ) == 3 + end - @allowscalar for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) - blockAp = blockview(Ap, bAp) - blockB = blockview(B, bB) - @test reshape(blockAp, size(blockB)) == blockB - end - end + @testset "Random constructor" for elt in [Float32, Float64] + T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(T) == 2 + @test nnz(T) == 8 + @test eltype(T) == elt + @test norm(T) ≉ 0 + + Tc = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) + @test nnzblocks(Tc) == 2 + @test nnz(Tc) == 8 + @test eltype(Tc) == complex(elt) + @test norm(Tc) ≉ 0 end - @testset "BlockSparseTensor setindex! 
add block" begin - T = BlockSparseTensor([2, 3], [4, 5]) + @testset "permute_combine" for elt in [Float32, Float64] + indsA = ([2, 3], [4, 5], [6, 7, 8]) + locsA = [(2, 1, 1), (1, 2, 1), (2, 2, 3)] + A = dev(BlockSparseTensor{elt}(locsA, indsA...)) + randn!(A) - @allowscalar for I in eachindex(T) - @test T[I] == 0.0 - end - @test nnz(T) == 0 - @test nnzblocks(T) == 0 - @test !isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test !isblocknz(T, (1, 2)) - @test !isblocknz(T, (2, 2)) + B = NDTensors.permute_combine(A, 3, (2, 1)) + @test nnzblocks(A) == nnzblocks(B) + @test nnz(A) == nnz(B) - T[1, 1] = 1.0 + Ap = NDTensors.permutedims(A, (3, 2, 1)) - @test T[1, 1] == 1.0 - @test nnz(T) == 8 - @test nnzblocks(T) == 1 - @test isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test !isblocknz(T, (1, 2)) - @test !isblocknz(T, (2, 2)) + @allowscalar for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) + blockAp = blockview(Ap, bAp) + blockB = blockview(B, bB) + @test reshape(blockAp, size(blockB)) == blockB + end + end + end - T[4, 8] = 2.0 + @testset "BlockSparseTensor setindex! add block" begin + T = BlockSparseTensor([2, 3], [4, 5]) - @test T[4, 8] == 2.0 - @test nnz(T) == 8 + 15 - @test nnzblocks(T) == 2 - @test isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test !isblocknz(T, (1, 2)) - @test isblocknz(T, (2, 2)) - - T[1, 6] = 3.0 - - @test T[1, 6] == 3.0 - @test nnz(T) == 8 + 15 + 10 - @test nnzblocks(T) == 3 - @test isblocknz(T, (1, 1)) - @test !isblocknz(T, (2, 1)) - @test isblocknz(T, (1, 2)) - @test isblocknz(T, (2, 2)) - - T[4, 2] = 4.0 - - @test T[4, 2] == 4.0 - @test nnz(T) == 8 + 15 + 10 + 12 - @test nnzblocks(T) == 4 - @test isblocknz(T, (1, 1)) - @test isblocknz(T, (2, 1)) - @test isblocknz(T, (1, 2)) - @test isblocknz(T, (2, 2)) + @allowscalar for I in eachindex(T) + @test T[I] == 0.0 end + @test nnz(T) == 0 + @test nnzblocks(T) == 0 + @test !isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test !isblocknz(T, (1, 2)) + @test !isblocknz(T, (2, 2)) + + T[1, 1] = 1.0 + + @test T[1, 1] == 1.0 + @test nnz(T) == 8 + @test nnzblocks(T) == 1 + @test isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test !isblocknz(T, (1, 2)) + @test !isblocknz(T, (2, 2)) + + T[4, 8] = 2.0 + + @test T[4, 8] == 2.0 + @test nnz(T) == 8 + 15 + @test nnzblocks(T) == 2 + @test isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test !isblocknz(T, (1, 2)) + @test isblocknz(T, (2, 2)) + + T[1, 6] = 3.0 + + @test T[1, 6] == 3.0 + @test nnz(T) == 8 + 15 + 10 + @test nnzblocks(T) == 3 + @test isblocknz(T, (1, 1)) + @test !isblocknz(T, (2, 1)) + @test isblocknz(T, (1, 2)) + @test isblocknz(T, (2, 2)) + + T[4, 2] = 4.0 + + @test T[4, 2] == 4.0 + @test nnz(T) == 8 + 15 + 10 + 12 + @test nnzblocks(T) == 4 + @test isblocknz(T, (1, 1)) + @test isblocknz(T, (2, 1)) + @test isblocknz(T, (1, 2)) + @test isblocknz(T, (2, 2)) + end - @testset "svd on $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float32, Float64] - if dev == NDTensors.mtl && elt == Float64 - continue; - end - @testset "svd example 1" begin - A = dev(BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) - end - - @testset "svd example 2" begin - A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) - end + @testset 
"svd on $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + elt in [Float32, Float64] - @testset "svd example 3" begin - A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) - end + if dev == NDTensors.mtl && elt == Float64 + continue + end + @testset "svd example 1" begin + A = dev(BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end - @testset "svd example 4" begin - A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) - end + @testset "svd example 2" begin + A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end - @testset "svd example 5" begin - A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) - randn!(A) - U, S, V = svd(A) - @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) - end + @testset "svd example 3" begin + A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) + randn!(A) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) end - @testset "exp" for elt in [Float32, Float64] - A = BlockSparseTensor{elt}([(1, 1), (2, 2)], [2, 4], [2, 4]) + @testset "svd example 4" begin + A = dev(BlockSparseTensor{elt}([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) randn!(A) - expT = exp(A) - @test array(expT) ≈ exp(array(A)) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) atol = NDTensorsTestUtils.default_rtol(elt) + end - # Hermitian case - A = BlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2])) + @testset "svd example 5" begin + A = dev(BlockSparseTensor{elt}([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) randn!(A) - Ah = BlockSparseTensor(complex(elt), undef, [(1, 1), (2, 2)], ([2, 2], [2, 2])) - for bA in eachnzblock(A) - b = blockview(A, bA) - blockview(Ah, bA) .= b + b' - end - expTh = exp(Hermitian(Ah)) - @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = NDTensorsTestUtils.default_rtol(eltype(Ah)) + U, S, V = svd(A) + @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) + atol = NDTensorsTestUtils.default_rtol(elt) + end + end - A = BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2]) - @test_throws ErrorException exp(A) + @testset "exp" for elt in [Float32, Float64] + A = BlockSparseTensor{elt}([(1, 1), (2, 2)], [2, 4], [2, 4]) + randn!(A) + expT = exp(A) + @test array(expT) ≈ exp(array(A)) + atol = NDTensorsTestUtils.default_rtol(elt) + + # Hermitian case + A = BlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2])) + randn!(A) + Ah = BlockSparseTensor(complex(elt), undef, [(1, 1), (2, 2)], ([2, 2], [2, 2])) + for bA in eachnzblock(A) + b = blockview(A, bA) + blockview(Ah, bA) .= b + b' end + expTh = exp(Hermitian(Ah)) + @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = NDTensorsTestUtils.default_rtol( + eltype(Ah) + ) + + A = BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2]) + @test_throws ErrorException exp(A) end end 
+end diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index f59333d94f..26937f8015 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -86,4 +86,4 @@ using ITensors: QN, Index end end end -end \ No newline at end of file +end diff --git a/NDTensors/test/test_dense.jl b/NDTensors/test/test_dense.jl index 105d75d03a..a0729d6768 100644 --- a/NDTensors/test/test_dense.jl +++ b/NDTensors/test/test_dense.jl @@ -276,4 +276,4 @@ using .NDTensorsTestUtils: NDTensorsTestUtils end nothing -end \ No newline at end of file +end diff --git a/NDTensors/test/test_diag.jl b/NDTensors/test/test_diag.jl index 849822e364..ca36531886 100644 --- a/NDTensors/test/test_diag.jl +++ b/NDTensors/test/test_diag.jl @@ -65,4 +65,4 @@ end @test contract(A, (-2, 1), t, (-2, 3)) == transpose(A) end nothing -end \ No newline at end of file +end diff --git a/NDTensors/test/test_diagblocksparse.jl b/NDTensors/test/test_diagblocksparse.jl index 070641b6c1..f47f4b5da0 100644 --- a/NDTensors/test/test_diagblocksparse.jl +++ b/NDTensors/test/test_diagblocksparse.jl @@ -25,4 +25,4 @@ using Test: @testset, @test @test conj(NeverAlias(), tensor)[1, 1] == conj(c) @test conj(AllowAlias(), tensor)[1, 1] == conj(c) end -end \ No newline at end of file +end diff --git a/NDTensors/test/test_emptynumber.jl b/NDTensors/test/test_emptynumber.jl index a1806e6a39..dc8357a115 100644 --- a/NDTensors/test/test_emptynumber.jl +++ b/NDTensors/test/test_emptynumber.jl @@ -29,4 +29,4 @@ const 𝟎 = NDTensors.EmptyNumber() @test norm(𝟎) == 0.0 @test norm(𝟎) isa Float64 end -end \ No newline at end of file +end diff --git a/NDTensors/test/test_emptystorage.jl b/NDTensors/test/test_emptystorage.jl index 272f203632..f4db78d409 100644 --- a/NDTensors/test/test_emptystorage.jl +++ b/NDTensors/test/test_emptystorage.jl @@ -32,4 +32,4 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test zero(T) isa typeof(T) end end -end \ No newline at end of file +end diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl index 94b64400f4..eae7b5e6b1 100644 --- a/NDTensors/test/test_linearalgebra.jl +++ b/NDTensors/test/test_linearalgebra.jl @@ -89,4 +89,4 @@ devs = devices_list(copy(ARGS)) end nothing -end \ No newline at end of file +end diff --git a/NDTensors/test/test_tupletools.jl b/NDTensors/test/test_tupletools.jl index 3e9547f757..3af9a1e5bc 100644 --- a/NDTensors/test/test_tupletools.jl +++ b/NDTensors/test/test_tupletools.jl @@ -8,4 +8,4 @@ using NDTensors: NDTensors end nothing -end \ No newline at end of file +end From bb424d7ca966ce72f516076984769cd348075db4 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 12:06:18 -0500 Subject: [PATCH 20/73] Create a couple necessary BlockSparseTensor constructors --- NDTensors/src/blocksparse/blocksparsetensor.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index f78cc16d20..ccd3dbe53c 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -48,6 +48,9 @@ end Construct a block sparse tensor with uninitialized memory from indices and locations of non-zero blocks. 
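
For example (a sketch mirroring how the tests later in this series call this constructor):

    B = BlockSparseTensor(Float64, undef, [(1, 1), (2, 2)], ([2, 2], [2, 2]))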
""" +function BlockSparseTensor{ElT}(::UndefInitializer, blocksoffsets, inds) where {ElT<:Number} + return BlockSparseTensor(ElT, undef, blockoffsets, inds) +end function BlockSparseTensor(::UndefInitializer, blockoffsets, inds) return BlockSparseTensor(NDTensors.default_eltype(), undef, blockoffsets, inds) end @@ -170,6 +173,12 @@ BlockSparseTensor(blocks::Vector{Block{N}}, Construct a block sparse tensor with the specified blocks. Defaults to setting structurally non-zero blocks to zero. """ +function BlockSparseTensor{ElT}( + blocks::Vector{BlockT}, inds::Vararg{BlockDim,N} +) where {ElT<:Number,BlockT<:Union{Block{N},NTuple{N,<:Integer}}} where {N} + return BlockSparseTensor(ElT, blocks, inds) +end + function BlockSparseTensor( blocks::Vector{BlockT}, inds::Vararg{BlockDim,N} ) where {BlockT<:Union{Block{N},NTuple{N,<:Integer}}} where {N} From b393700c84c639e40fa98e4dc662c3574d2bf3af Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 12:13:21 -0500 Subject: [PATCH 21/73] fix dmrg tests --- NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl | 3 +-- NDTensors/test/ITensors/TestITensorDMRG/runtests.jl | 4 ++-- NDTensors/test/test_linearalgebra.jl | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl index e9f3e26cfd..dd8117099e 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl @@ -1,5 +1,4 @@ include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: default_rtol, is_supported_eltype, devices_list function test_dmrg(elt, N::Integer; dev::Function, conserve_qns) sites = siteinds("S=1/2", N; conserve_qns) @@ -23,5 +22,5 @@ function test_dmrg(elt, N::Integer; dev::Function, conserve_qns) maxdim = 32 energy, psi = dmrg(H, psi0; nsweeps, cutoff, maxdim, noise, outputlevel=0) - @test energy ≈ reference_energies[N] rtol = default_rtol(elt) + @test energy ≈ reference_energies[N] rtol = NDTensorsTestUtils.default_rtol(elt) end diff --git a/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl b/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl index 3c451ba275..388908da98 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl @@ -2,10 +2,10 @@ using Test using NDTensors ## TODO headergaurd include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: default_rtol, is_supported_eltype, devices_list +using .NDTensorsTestUtils: NDTensorsTestUtils include("TestITensorDMRG.jl") -@testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in devices_list(ARGS), +@testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in NDTensorsTestUtils.devices_list(ARGS), conserve_qns in [false, true], elt in (Float32, ComplexF32, Float64, ComplexF64), N in [4, 10] diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl index eae7b5e6b1..1d34745dbe 100644 --- a/NDTensors/test/test_linearalgebra.jl +++ b/NDTensors/test/test_linearalgebra.jl @@ -4,7 +4,7 @@ using LinearAlgebra: Diagonal, qr, diag using Test: @testset, @test using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: default_rtol, devices_list +using .NDTensorsTestUtils: NDTensorsTestUtils @testset "random_orthog" begin n, m = 10, 4 @@ -24,7 +24,6 @@ end @test norm(U2 * U2' - Diagonal(fill(1.0, m))) < 1E-14 end -devs = devices_list(copy(ARGS)) @testset "QX 
testing" begin @testset "Dense $qx decomposition, elt=$elt, positve=$positive, singular=$singular, device=$dev" for qx in [ @@ -33,7 +32,7 @@ devs = devices_list(copy(ARGS)) elt in [Float64, ComplexF64, Float32, ComplexF32], positive in [false, true], singular in [false, true], - dev in devs + dev in NDTensorsTestUtils.devices_list(copy(ARGS)) ## Skip Float64 on Metal if dev == NDTensors.mtl && (elt == Float64 || elt == ComplexF64) From 49b0f7b75fb11d54fc397334e344448cf7535e58 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 13:25:23 -0500 Subject: [PATCH 22/73] Fix metal tests --- .../src/blocksparse/blocksparsetensor.jl | 3 -- NDTensors/src/tensorstorage/tensorstorage.jl | 2 +- NDTensors/test/test_blocksparse.jl | 13 ++++----- NDTensors/test/test_dense.jl | 28 +++++++++++-------- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index ccd3dbe53c..98b0a70e95 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -48,9 +48,6 @@ end Construct a block sparse tensor with uninitialized memory from indices and locations of non-zero blocks. """ -function BlockSparseTensor{ElT}(::UndefInitializer, blocksoffsets, inds) where {ElT<:Number} - return BlockSparseTensor(ElT, undef, blockoffsets, inds) -end function BlockSparseTensor(::UndefInitializer, blockoffsets, inds) return BlockSparseTensor(NDTensors.default_eltype(), undef, blockoffsets, inds) end diff --git a/NDTensors/src/tensorstorage/tensorstorage.jl b/NDTensors/src/tensorstorage/tensorstorage.jl index 7ddb20777e..71a6f57678 100644 --- a/NDTensors/src/tensorstorage/tensorstorage.jl +++ b/NDTensors/src/tensorstorage/tensorstorage.jl @@ -29,7 +29,7 @@ Base.@propagate_inbounds function Base.setindex!(S::TensorStorage, v, i::Integer end ## Missing a check or conversion when calling number * Tensor. This causes Metal to fail numerically because it tries to convert it to Float64. Preserve S eltype. ## TODO this could probably be handled differently/better? 
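## (For example: with Float32 storage on Metal, a Float64 scalar promotes the
## result and the backend errors out, e.g. `MtlVector{Float32}(undef, (10,)) .+ 2.0`
## throws "Metal does not support Float64 values, try using Float32 instead".)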
-(S::TensorStorage * x::Number) = setdata(S, eltype(S)(x) * data(S)) +(S::TensorStorage * x::Number) = setdata(S, x * data(S)) (x::Number * S::TensorStorage) = S * x (S::TensorStorage / x::Number) = setdata(S, data(S) / x) diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 3cbbc5bf67..125d832397 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -11,7 +11,6 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float32, Float64] - if dev == NDTensors.mtl && elt == Float64 continue end @@ -77,7 +76,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test A12[I] == A[I + CartesianIndex(0, 4)] end - B = dev(BlockSparseTensor(undef, locs, indsA)) + B = dev(BlockSparseTensor(elt, undef, locs, indsA)) randn!(B) C = A + B @@ -102,7 +101,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test conj(data(store(A))) == data(store(conj(A))) @test typeof(conj(A)) <: BlockSparseTensor - @testset "Random constructor" for elt in [Float32, Float64] + @testset "Random constructor" begin T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) @test nnzblocks(T) == 2 @test nnz(T) == 8 @@ -116,7 +115,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test norm(Tc) ≉ 0 end - @testset "Complex Valued Operations" for elt in [Float32, Float64] + @testset "Complex Valued Operations" begin T = dev(randomBlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2]))) rT = real(T) @test eltype(rT) == elt @@ -131,7 +130,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test nnzblocks(cT) == nnzblocks(T) end - @testset "similartype regression test" for elt in [Float32, Float64] + @testset "similartype regression test" begin # Regression test for issue seen in: # https://github.com/ITensor/ITensorInfiniteMPS.jl/pull/77 # Previously, `similartype` wasn't using information about the dimensions @@ -143,7 +142,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils ) == 3 end - @testset "Random constructor" for elt in [Float32, Float64] + @testset "Random constructor" begin T = dev(randomBlockSparseTensor(elt, [(1, 1), (2, 2)], ([2, 2], [2, 2]))) @test nnzblocks(T) == 2 @test nnz(T) == 8 @@ -157,7 +156,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test norm(Tc) ≉ 0 end - @testset "permute_combine" for elt in [Float32, Float64] + @testset "permute_combine" begin indsA = ([2, 3], [4, 5], [6, 7, 8]) locsA = [(2, 1, 1), (1, 2, 1), (2, 2, 3)] A = dev(BlockSparseTensor{elt}(locsA, indsA...)) diff --git a/NDTensors/test/test_dense.jl b/NDTensors/test/test_dense.jl index a0729d6768..bfe80baf28 100644 --- a/NDTensors/test/test_dense.jl +++ b/NDTensors/test/test_dense.jl @@ -1,12 +1,12 @@ @eval module $(gensym()) using NDTensors -using Test: @testset, @test, @test_throws +using Test: @testset, @test, @test_throws, @test_broken using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils @testset "Dense Tensors" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) elt = dev == NDTensors.mtl ? 
Float32 : Float64 # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin @@ -33,7 +33,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils Aview = A[2:3, 2:3] @test dims(Aview) == (2, 2) - B = dev(Tensor(undef, (3, 4))) + B = dev(Tensor(elt, undef, (3, 4))) randn!(B) C = copy(A) C = permutedims!!(C, B, (1, 2), +) @@ -68,10 +68,16 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test A[2, 2] == Aview[1, 1] end - ## Right now this treats the `Tensor` type as an abstract Array - ## And calls getindex instead of CUDA.==. Can fix by converting to CPU or - ## Just looking at the data - @test data(A * 2.0) == data(2.0 * A) + ## There is an issue in metal like this + ## julia> MtlVector{Float32}(undef, (10,)) .+ 2.0 + ## ERROR: Metal does not support Float64 values, try using Float32 instead + ## This is a temporary fix while metal is broken + if dev == NDTensors.mtl + #@test data(A * elt(2.0)) == data(elt(2.0) * A) + @test_broken data(A * 2.0) == data(2.0 * A) + else + @test data(A * 2.0) == data(2.0 * A) + end Asim = similar(data(A), 10) @test eltype(Asim) == elt @@ -116,8 +122,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test dim(I) == 1000 @test Array(I) == I_arr - J = dev(Tensor((2, 2))) - K = dev(Tensor((2, 2))) + J = dev(Tensor(elt, (2, 2))) + K = dev(Tensor(elt, (2, 2))) @test Array(J * K) ≈ Array(J) * Array(K) end @@ -213,7 +219,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils R = dev(Tensor(complex(elt), (2, 2, 1))) fill!(R, NaN) @test @allowscalar any(isnan, R) - T1 = dev(randomTensor((2, 2, 1))) + T1 = dev(randomTensor(elt, (2, 2, 1))) T2 = dev(randomTensor(complex(elt), (1, 1))) NDTensors.contract!(R, (1, 2, 3), T1, (1, 2, -1), T2, (-1, 1)) @test @allowscalar !any(isnan, R) @@ -224,7 +230,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils R = dev(Tensor(complex(elt), (2, 2, 1))) fill!(R, NaN) @test @allowscalar any(isnan, R) - T1 = dev(randomTensor((2, 2, 1))) + T1 = dev(randomTensor(elt, (2, 2, 1))) T2 = dev(randomTensor(complex(elt), (1, 1))) NDTensors.contract!(R, (2, 1, 3), T1, (1, 2, -1), T2, (-1, 1)) @test @allowscalar !any(isnan, R) From c50f4310fc0bfe5779a40fcf39e46b006e9899e5 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 13:25:40 -0500 Subject: [PATCH 23/73] format --- NDTensors/test/ITensors/TestITensorDMRG/runtests.jl | 5 ++++- NDTensors/test/test_blocksparse.jl | 1 + NDTensors/test/test_dense.jl | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl b/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl index 388908da98..79b09ddeb0 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/runtests.jl @@ -5,7 +5,10 @@ include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils include("TestITensorDMRG.jl") -@testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in NDTensorsTestUtils.devices_list(ARGS), +@testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in + NDTensorsTestUtils.devices_list( + ARGS + ), conserve_qns in [false, true], elt in (Float32, ComplexF32, Float64, ComplexF64), N in [4, 10] diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 125d832397..96a420d326 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -11,6 +11,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt 
in [Float32, Float64] + if dev == NDTensors.mtl && elt == Float64 continue end diff --git a/NDTensors/test/test_dense.jl b/NDTensors/test/test_dense.jl index bfe80baf28..20b5cda860 100644 --- a/NDTensors/test/test_dense.jl +++ b/NDTensors/test/test_dense.jl @@ -6,7 +6,7 @@ include("NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils @testset "Dense Tensors" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) elt = dev == NDTensors.mtl ? Float32 : Float64 # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin From 06a9e4787b4882df6abb0dca8f9bf429740ff50d Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 13:36:24 -0500 Subject: [PATCH 24/73] This is fixed in this pr --- NDTensors/ext/examples/NDTensorCUDA.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index 6a9efe749f..73f424cf7f 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -61,7 +61,7 @@ function main() #Currently this code fails with CUDA.allowscalar(false) # Because of outer calling the _gemm! function which calls a # generic implementation - @allowscalar grad = gradient(f, cA, cB, cC, cD) + grad = gradient(f, cA, cB, cC, cD) @allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1]) @allowscalar @test (cB * cC * cD) ≈ grad[1] # Create a tuple of indices From d39f765e0850cadd1f0c838fb57b20c448f3cd40 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 13:36:43 -0500 Subject: [PATCH 25/73] fix elt issue in combiner --- NDTensors/test/test_combiner.jl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index 26937f8015..cdfd8faa0f 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -9,14 +9,17 @@ using .NDTensorsTestUtils: NDTensorsTestUtils using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float64, Float32] + if dev == NDTensors.mtl && elt == Float64 + continue + end @testset "Dense * Combiner" begin d = 2 input_tensor_inds = (d, d, d) combiner_tensor_inds = (d^2, d, d) output_tensor_inds = (d, d^2) - input_tensor = dev(tensor(Dense(randn(input_tensor_inds)), input_tensor_inds)) + input_tensor = dev(tensor(Dense(randn(elt, input_tensor_inds)), input_tensor_inds)) combiner_tensor = dev(tensor(Combiner([1], [1]), combiner_tensor_inds)) output_tensor = contract(input_tensor, (1, -1, -2), combiner_tensor, (2, -1, -2)) @@ -32,7 +35,7 @@ using ITensors: QN, Index # Catch invalid combining input_tensor_inds = (d,) - input_tensor = dev(tensor(Dense(randn(input_tensor_inds)), input_tensor_inds)) + input_tensor = dev(tensor(Dense(randn(elt, input_tensor_inds)), input_tensor_inds)) combiner_tensor = dev(tensor(Combiner([1], [1]), combiner_tensor_inds)) @test_throws Any contract(input_tensor, (-1,), combiner_tensor, (1, -1, -2)) end @@ -51,7 +54,7 @@ using ITensors: QN, Index input_tensor = dev( tensor( BlockSparse( - randn(dim(input_tensor_inds)), BlockOffsets{3}([Block(1, 1, 1)], [0]) + randn(elt, dim(input_tensor_inds)), 
BlockOffsets{3}([Block(1, 1, 1)], [0]) ), input_tensor_inds, ), @@ -76,7 +79,7 @@ using ITensors: QN, Index invalid_input_tensor = dev( tensor( BlockSparse( - randn(dim(invalid_input_tensor_inds)), BlockOffsets{1}([Block(1)], [0]) + randn(elt, dim(invalid_input_tensor_inds)), BlockOffsets{1}([Block(1)], [0]) ), invalid_input_tensor_inds, ), From cb652b745a04d7764739cd408a9839fa1a01eb1b Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 21 Nov 2023 13:38:02 -0500 Subject: [PATCH 26/73] format --- NDTensors/test/test_combiner.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index cdfd8faa0f..607ee7af21 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -9,7 +9,9 @@ using .NDTensorsTestUtils: NDTensorsTestUtils using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), elt in [Float64, Float32] + @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + elt in [Float64, Float32] + if dev == NDTensors.mtl && elt == Float64 continue end From c4e7332192ab0a1400a1209962d32777aa8e96d2 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:22:46 -0500 Subject: [PATCH 27/73] use tuple when different types --- NDTensors/test/test_blocksparse.jl | 10 +++++----- NDTensors/test/test_combiner.jl | 4 ++-- NDTensors/test/test_linearalgebra.jl | 2 +- test/base/test_decomp.jl | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 96a420d326..4d1ead4635 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -9,8 +9,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @testset "BlockSparseTensor basic functionality" begin C = nothing - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), - elt in [Float32, Float64] + @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + elt in (Float32, Float64) if dev == NDTensors.mtl && elt == Float64 continue @@ -231,8 +231,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test isblocknz(T, (2, 2)) end - @testset "svd on $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), - elt in [Float32, Float64] + @testset "svd on $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + elt in (Float32, Float64) if dev == NDTensors.mtl && elt == Float64 continue @@ -278,7 +278,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils end end - @testset "exp" for elt in [Float32, Float64] + @testset "exp, eltype: $elt" for elt in (Float32, Float64) A = BlockSparseTensor{elt}([(1, 1), (2, 2)], [2, 4], [2, 4]) randn!(A) expT = exp(A) diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index 607ee7af21..e81a54ac8b 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -9,8 +9,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), - elt in [Float64, Float32] + @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + elt in (Float64, Float32) if dev == NDTensors.mtl && elt == Float64 continue diff --git a/NDTensors/test/test_linearalgebra.jl 
b/NDTensors/test/test_linearalgebra.jl index 1d34745dbe..e6679ec883 100644 --- a/NDTensors/test/test_linearalgebra.jl +++ b/NDTensors/test/test_linearalgebra.jl @@ -29,7 +29,7 @@ end [ qr, ql ], - elt in [Float64, ComplexF64, Float32, ComplexF32], + elt in (Float64, ComplexF64, Float32, ComplexF32), positive in [false, true], singular in [false, true], dev in NDTensorsTestUtils.devices_list(copy(ARGS)) diff --git a/test/base/test_decomp.jl b/test/base/test_decomp.jl index 086d0aea40..a4c8fc1554 100644 --- a/test/base/test_decomp.jl +++ b/test/base/test_decomp.jl @@ -140,7 +140,7 @@ end [ 0, 1, 2, 3 ], - elt in [Float64, ComplexF64] + elt in (Float64, ComplexF64) l = Index(5, "l") s = Index(2, "s") From dc8b13eb2a5d8897bb707fecd1eafd31e9d0d0d2 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:26:49 -0500 Subject: [PATCH 28/73] Fix typo --- NDTensors/src/blocksparse/blocksparsetensor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index 98b0a70e95..521881c6e8 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -73,7 +73,7 @@ end Construct a block sparse tensor with no blocks. """ -BlockSparseTensor(inds) = BlockSparseTensor(FloNDTensors.default_eltype()at64, inds) +BlockSparseTensor(inds) = BlockSparseTensor(NDTensors.default_eltype(), inds) function BlockSparseTensor(datatype::Type{<:AbstractArray}, inds) return BlockSparseTensor(datatype, BlockOffsets{length(inds)}(), inds) From ef3b4d58dc8e4e6794b23758e52163898f99808b Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:39:07 -0500 Subject: [PATCH 29/73] Alphabetize and consistently use LinearAlgebra. --- NDTensors/src/Unwrap/test/runtests.jl | 35 +++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index 6f2e3bc0fd..6df59edde7 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -1,8 +1,7 @@ using Test: @testset, @test, @test_broken using NDTensors.Unwrap using NDTensors: NDTensors, mul!! -using LinearAlgebra: - LinearAlgebra, Transpose, qr, Symmetric, eigen, Hermitian, Diagonal, svd, mul! 
+using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, Hermitian, Symmetric, Transpose, eigen , mul!, qr, svd, using GPUArraysCore: @allowscalar include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils @@ -55,7 +54,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils copyto!(expose(mp), expose(ma)) @test mp == ma - q, r = qr(expose(mp)) + q, r = LinearAlgebra.qr(expose(mp)) @test q * r ≈ mp q, r = Unwrap.qr_positive(expose(mp)) @@ -65,29 +64,29 @@ using .NDTensorsTestUtils: NDTensorsTestUtils square = (square + transpose(square)) / 2 ## CUDA only supports Hermitian or Symmetric eigen decompositions ## So I symmetrize square and call symetric here - l, U = eigen(expose(Symmetric(square))) + l, U = LinearAlgebra.eigen(expose(LinearAlgebra.Symmetric(square))) @test eltype(l) == real(elt) @test eltype(U) == real(elt) - @test square * U ≈ U * Diagonal(l) + @test square * U ≈ U * LinearAlgebra.Diagonal(l) square = dev(rand(elt, (10, 10))) # Can use `hermitianpart` in Julia 1.10 square = (square + square') / 2 ## CUDA only supports Hermitian or Symmetric eigen decompositions ## So I symmetrize square and call symetric here - l, U = eigen(expose(Hermitian(square))) + l, U = LinearAlgebra.eigen(expose(LinearAlgebra.Hermitian(square))) @test eltype(l) == real(elt) @test eltype(U) == elt - @test square * U ≈ U * Diagonal(l) + @test square * U ≈ U * LinearAlgebra.Diagonal(l) - U, S, V, = svd(expose(mp)) + U, S, V, = LinearAlgebra.svd(expose(mp)) @test eltype(U) == elt @test eltype(S) == real(elt) @test eltype(V) == elt - @test U * Diagonal(S) * V' ≈ mp + @test U * LinearAlgebra.Diagonal(S) * V' ≈ mp cm = dev(randn(elt, 2, 2)) - mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) + LinearAlgebra.mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) @test cm ≈ mp * mp' @test permutedims(expose(mp), (2, 1)) == transpose(mp) @@ -155,10 +154,10 @@ using .NDTensorsTestUtils: NDTensorsTestUtils ## This fails with scalar indexing if dev != NDTensors.cpu - @test_broken mul!(transpose(C), transpose(A), B, true, false) + @test_broken LinearAlgebra.mul!(transpose(C), transpose(A), B, true, false) end - mul!(C, transpose(B), A, true, false) - mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) + LinearAlgebra.mul!(C, transpose(B), A, true, false) + LinearAlgebra.mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) @test C ≈ Cp Cp = zero(C) ## Try calling mul!! 
with transposes to verify that code works @@ -168,10 +167,10 @@ using .NDTensorsTestUtils: NDTensorsTestUtils Cp = zero(C) ## This fails with scalar indexing if dev != NDTensors.cpu - @test_broken mul!(C', A', B, true, false) + @test_broken LinearAlgebra.mul!(C', A', B, true, false) end - mul!(C, B', A, true, false) - mul!(expose(Cp'), expose(A'), expose(B), true, false) + LinearAlgebra.mul!(C, B', A, true, false) + LinearAlgebra.mul!(expose(Cp'), expose(A'), expose(B), true, false) @test C ≈ Cp Cp = zero(C) Cpt = NDTensors.mul!!(Cp', A', B, true, false) @@ -183,9 +182,9 @@ using .NDTensorsTestUtils: NDTensorsTestUtils A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) B = dev(randn(elt, 2, 2)) C = dev(zeros(elt, 2, 12)) - NDTensors.mul!(expose(C), expose(B), expose(A), true, false) + LinearAlgebra..mul!(expose(C), expose(B), expose(A), true, false) Cp = NDTensors.cpu(similar(C)) - NDTensors.mul!( + LinearAlgebra..mul!( expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false ) @test NDTensors.cpu(C) ≈ Cp From b22c430a6380c434ba38efdbeadd880c952a5694 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:40:32 -0500 Subject: [PATCH 30/73] Remove commented code --- NDTensors/src/dense/tensoralgebra/outer.jl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/NDTensors/src/dense/tensoralgebra/outer.jl b/NDTensors/src/dense/tensoralgebra/outer.jl index b2143c77f7..d66d87a2a7 100644 --- a/NDTensors/src/dense/tensoralgebra/outer.jl +++ b/NDTensors/src/dense/tensoralgebra/outer.jl @@ -21,12 +21,6 @@ function outer!( v1 = data(T1) v2 = data(T2) RM = reshape(R, length(v1), length(v2)) - ## Potential fix is call reshape on array - #RM = reshape(array(R), length(v1), length(v2)) - #RM .= v1 .* transpose(v2) - #mul!(RM, v1, transpose(v2)) - #ger!(one(ElR), zero(ElR), v1, v2) - #_gemm!('N', 'T', one(ElR), v1, v2, zero(ElR), array(RM)) ## There is no _gemm! defined for CUDA or Metal so it calls ## generic matmul. Replace with mul!! to call correct mul!! (ger) mul!!(array(RM), v1, transpose(v2), one(ElR), zero(ElR)) From e6baedc338b44f2e481ee3457895efa1cbca3772 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:45:33 -0500 Subject: [PATCH 31/73] Remove comment --- NDTensors/src/tensorstorage/tensorstorage.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/NDTensors/src/tensorstorage/tensorstorage.jl b/NDTensors/src/tensorstorage/tensorstorage.jl index 71a6f57678..9708379757 100644 --- a/NDTensors/src/tensorstorage/tensorstorage.jl +++ b/NDTensors/src/tensorstorage/tensorstorage.jl @@ -28,7 +28,6 @@ Base.@propagate_inbounds function Base.setindex!(S::TensorStorage, v, i::Integer return (setindex!(data(S), v, i); S) end -## Missing a check or conversion when calling number * Tensor. This causes Metal to fail numerically because it tries to convert it to Float64. Preserve S eltype. ## TODO this could probably be handled differently/better? 
(S::TensorStorage * x::Number) = setdata(S, x * data(S)) (x::Number * S::TensorStorage) = S * x (S::TensorStorage / x::Number) = setdata(S, data(S) / x) From 1473da7b2e585af6f9798beaade600aa83d98bea Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:45:45 -0500 Subject: [PATCH 32/73] remove comma --- NDTensors/src/Unwrap/test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index 6df59edde7..2751d0ac3d 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -1,7 +1,7 @@ using Test: @testset, @test, @test_broken using NDTensors.Unwrap using NDTensors: NDTensors, mul!! -using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, Hermitian, Symmetric, Transpose, eigen , mul!, qr, svd, +using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, Hermitian, Symmetric, Transpose, eigen , mul!, qr, svd using GPUArraysCore: @allowscalar include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils From 40be5aa586de8f95cd3b18a27f655af9b1116b7d Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 21 Nov 2023 15:47:38 -0500 Subject: [PATCH 33/73] Split into two lines --- NDTensors/ext/NDTensorsMetalExt/mul.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl index b9e8667155..ad63d92656 100644 --- a/NDTensors/ext/NDTensorsMetalExt/mul.jl +++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl @@ -34,6 +34,7 @@ function LinearAlgebra.mul!( α, β, ) - mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β) + B = copy(expose(parent(BM))) + mul!(CM, AM, expose(transpose(B)), α, β) return unexpose(CM) end From fc7561f1219c8f0f1e1321ae8dfaacfbfd40dc81 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 22 Nov 2023 08:31:38 -0500 Subject: [PATCH 34/73] Need to have module `NDTensors` in path --- NDTensors/test/runtests.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl index deff1f516d..d273bcf90d 100644 --- a/NDTensors/test/runtests.jl +++ b/NDTensors/test/runtests.jl @@ -12,9 +12,11 @@ using SafeTestsets: @safetestset end end if "cuda" in ARGS || "all" in ARGS + using NDTensors include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorCUDA.jl")) end if "metal" in ARGS || "all" in ARGS + using NDTensors include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorMetal.jl")) end end From 25f6d04a92a3b449ac977f314b67b3381a47f8dc Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 22 Nov 2023 08:32:47 -0500 Subject: [PATCH 35/73] Reorder for documentation --- NDTensors/src/blocksparse/blocksparsetensor.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index 521881c6e8..65f7966497 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -170,18 +170,18 @@ BlockSparseTensor(blocks::Vector{Block{N}}, Construct a block sparse tensor with the specified blocks. Defaults to setting structurally non-zero blocks to zero. 
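
For example (a sketch; the same call shape appears in the `exp` tests earlier in this series):

    A = BlockSparseTensor(Float64, [(1, 1), (2, 2)], ([2, 2], [2, 2]))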
""" -function BlockSparseTensor{ElT}( - blocks::Vector{BlockT}, inds::Vararg{BlockDim,N} -) where {ElT<:Number,BlockT<:Union{Block{N},NTuple{N,<:Integer}}} where {N} - return BlockSparseTensor(ElT, blocks, inds) -end - function BlockSparseTensor( blocks::Vector{BlockT}, inds::Vararg{BlockDim,N} ) where {BlockT<:Union{Block{N},NTuple{N,<:Integer}}} where {N} return BlockSparseTensor(blocks, inds) end +function BlockSparseTensor{ElT}( + blocks::Vector{BlockT}, inds::Vararg{BlockDim,N} +) where {ElT<:Number,BlockT<:Union{Block{N},NTuple{N,<:Integer}}} where {N} + return BlockSparseTensor(ElT, blocks, inds) +end + function zeros( tensor::BlockSparseTensor{ElT,N}, blockoffsets::BlockOffsets{N}, inds ) where {ElT,N} From 0fbe7b879fdabe92f9a59359a1f68efd5443712e Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 22 Nov 2023 08:39:28 -0500 Subject: [PATCH 36/73] Update dense test code --- NDTensors/test/test_dense.jl | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/NDTensors/test/test_dense.jl b/NDTensors/test/test_dense.jl index 20b5cda860..d0872b0116 100644 --- a/NDTensors/test/test_dense.jl +++ b/NDTensors/test/test_dense.jl @@ -68,16 +68,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test A[2, 2] == Aview[1, 1] end - ## There is an issue in metal like this - ## julia> MtlVector{Float32}(undef, (10,)) .+ 2.0 - ## ERROR: Metal does not support Float64 values, try using Float32 instead - ## This is a temporary fix while metal is broken - if dev == NDTensors.mtl - #@test data(A * elt(2.0)) == data(elt(2.0) * A) - @test_broken data(A * 2.0) == data(2.0 * A) - else - @test data(A * 2.0) == data(2.0 * A) - end + ## add elt around 2.0 to preserve the eltype of A. + @test data(A * elt(2.0)) == data(elt(2.0) * A) Asim = similar(data(A), 10) @test eltype(Asim) == elt From 488aefd27b5304a6c10a0579880899a6a186f065 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 22 Nov 2023 08:41:51 -0500 Subject: [PATCH 37/73] Add better comment --- NDTensors/test/readwrite.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NDTensors/test/readwrite.jl b/NDTensors/test/readwrite.jl index a79910eb14..50576d3aa5 100644 --- a/NDTensors/test/readwrite.jl +++ b/NDTensors/test/readwrite.jl @@ -1,4 +1,5 @@ -## TODO this file doesn't seem to work properly +## TODO this file was not included in the previous testing +## and appears to be out of date with current code. using NDTensors, Test using HDF5 From e7ed450d5dc2850ac20e51c88b714db6b70d0b33 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 22 Nov 2023 09:07:04 -0500 Subject: [PATCH 38/73] Fix remaining issues --- NDTensors/src/Unwrap/test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index 2751d0ac3d..bd36e4bad1 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -1,7 +1,7 @@ using Test: @testset, @test, @test_broken using NDTensors.Unwrap using NDTensors: NDTensors, mul!! 
-using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, Hermitian, Symmetric, Transpose, eigen , mul!, qr, svd +using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, Hermitian, Symmetric, Transpose, eigen , mul!, norm, qr, svd using GPUArraysCore: @allowscalar include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils @@ -182,9 +182,9 @@ using .NDTensorsTestUtils: NDTensorsTestUtils A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) B = dev(randn(elt, 2, 2)) C = dev(zeros(elt, 2, 12)) - LinearAlgebra..mul!(expose(C), expose(B), expose(A), true, false) + LinearAlgebra.mul!(expose(C), expose(B), expose(A), true, false) Cp = NDTensors.cpu(similar(C)) - LinearAlgebra..mul!( + LinearAlgebra.mul!( expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false ) @test NDTensors.cpu(C) ≈ Cp From 87c50e35cffc336eddf6a55f91509dd81cf89520 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 22 Nov 2023 09:08:26 -0500 Subject: [PATCH 39/73] format --- NDTensors/src/Unwrap/test/runtests.jl | 13 ++++++++++++- NDTensors/test/test_blocksparse.jl | 7 +++++-- NDTensors/test/test_combiner.jl | 4 +++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index bd36e4bad1..8797feaf74 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -1,7 +1,18 @@ using Test: @testset, @test, @test_broken using NDTensors.Unwrap using NDTensors: NDTensors, mul!! -using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, Hermitian, Symmetric, Transpose, eigen , mul!, norm, qr, svd +using LinearAlgebra: + LinearAlgebra, + Adjoint, + Diagonal, + Hermitian, + Symmetric, + Transpose, + eigen, + mul!, + norm, + qr, + svd using GPUArraysCore: @allowscalar include("../../../test/NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: NDTensorsTestUtils diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 4d1ead4635..146653a7f2 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -9,7 +9,9 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @testset "BlockSparseTensor basic functionality" begin C = nothing - @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list( + copy(ARGS) + ), elt in (Float32, Float64) if dev == NDTensors.mtl && elt == Float64 @@ -231,7 +233,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test isblocknz(T, (2, 2)) end - @testset "svd on $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + @testset "svd on $dev, eltype: $elt" for dev in + NDTensorsTestUtils.devices_list(copy(ARGS)), elt in (Float32, Float64) if dev == NDTensors.mtl && elt == Float64 diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index e81a54ac8b..c489f8b9f7 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -9,7 +9,9 @@ using .NDTensorsTestUtils: NDTensorsTestUtils using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list( + copy(ARGS) + ), elt in (Float64, Float32) if dev == NDTensors.mtl && elt == Float64 From 
48806559852792b0e82d77b5766387fd6bde2d31 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Wed, 22 Nov 2023 11:10:02 -0500
Subject: [PATCH 40/73] Metal testing now fully functional; remove it from
 extras

---
 NDTensors/test/Project.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/NDTensors/test/Project.toml b/NDTensors/test/Project.toml
index 46d2cf33bb..f5f331e7db 100644
--- a/NDTensors/test/Project.toml
+++ b/NDTensors/test/Project.toml
@@ -16,6 +16,4 @@ TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
-
-[extras]
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

From 16b9c7494cde952dc38121f57616035c3305e51c Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Sun, 26 Nov 2023 14:35:39 -0500
Subject: [PATCH 41/73] Move Metal back to extras because it cannot be
 installed properly for unit testing

---
 NDTensors/test/Project.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NDTensors/test/Project.toml b/NDTensors/test/Project.toml
index 6ee347f081..5301386cba 100644
--- a/NDTensors/test/Project.toml
+++ b/NDTensors/test/Project.toml
@@ -17,4 +17,6 @@ TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+
+[extras]
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

From 7e4839aff0613b15062ff6a3517edbba78937ac5 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Sun, 26 Nov 2023 14:59:44 -0500
Subject: [PATCH 42/73] use is_supported_eltype function

---
 NDTensors/test/test_blocksparse.jl   | 2 +-
 NDTensors/test/test_combiner.jl      | 2 +-
 NDTensors/test/test_linearalgebra.jl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl
index 146653a7f2..4a8e2f6aee 100644
--- a/NDTensors/test/test_blocksparse.jl
+++ b/NDTensors/test/test_blocksparse.jl
@@ -14,7 +14,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils
   ),
   elt in (Float32, Float64)

-  if dev == NDTensors.mtl && elt == Float64
+  if !NDTensorsTestUtils.is_supported_eltype(dev, elt)
     continue
   end
   # Indices
diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl
index c489f8b9f7..001e86a1aa 100644
--- a/NDTensors/test/test_combiner.jl
+++ b/NDTensors/test/test_combiner.jl
@@ -14,7 +14,7 @@ using ITensors: QN, Index
   ),
   elt in (Float64, Float32)

-  if dev == NDTensors.mtl && elt == Float64
+  if !NDTensorsTestUtils.is_supported_eltype(dev, elt)
     continue
   end
   @testset "Dense * Combiner" begin
diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl
index e6679ec883..e9e3332eb7 100644
--- a/NDTensors/test/test_linearalgebra.jl
+++ b/NDTensors/test/test_linearalgebra.jl
@@ -35,7 +35,7 @@ end
   dev in NDTensorsTestUtils.devices_list(copy(ARGS))

   ## Skip Float64 on Metal
-  if dev == NDTensors.mtl && (elt == Float64 || elt == ComplexF64)
+  if !NDTensorsTestUtils.is_supported_eltype(dev, elt)
     continue
   end
   eps = Base.eps(real(elt)) * 100 #this is set rather tight, so if you increase/change m,n you may have open up the tolerance on eps.
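
The tests above now call `NDTensorsTestUtils.is_supported_eltype` without its
definition appearing in this series. A minimal sketch of what it presumably
does, inferred from the skip conditions it replaces (the real definition lives
in NDTensorsTestUtils and may differ):

    using NDTensors: NDTensors
    # everything is supported by default
    is_supported_eltype(dev, elt::Type) = true
    # Metal has no double-precision support, so skip Float64 and ComplexF64
    function is_supported_eltype(dev::typeof(NDTensors.mtl), elt::Type)
      return real(elt) != Float64
    end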
From 2ef4f1698ccfede26e194ecf6404a9593922482d Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Mon, 27 Nov 2023 10:04:07 -0500 Subject: [PATCH 43/73] Update NDTensorsCUDAExt example --- NDTensors/ext/examples/NDTensorCUDA.jl | 47 +++++++++++++------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index 73f424cf7f..27a2f542f4 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -1,10 +1,10 @@ -using CUDA using NDTensors - -using ITensors -using Test - -using Zygote +using CUDA: cu, CUDA, CuVector +using ITensors: + Index, inner, ITensor, orthogonalize, qr, randomMPO, randomMPS, siteinds, storage, svd +using Test: @test +using Zygote: gradient +using GPUArraysCore: @allowscalar function main() # using ITensorGPU @@ -22,6 +22,7 @@ function main() B = ITensor(NDTensors.generic_randn(CuVector, dim(dim2)), dim2) # Contract the two tensors cpu = NDTensors.cpu + gpu = NDTensors.cu C = A * B A = cpu(A) B = cpu(B) @@ -36,8 +37,8 @@ function main() fill!(B, randn()) # Convert the ITensors to GPU - cA = NDTensors.cu(A) - cB = NDTensors.cu(B) + cA = gpu(A) + cB = gpu(B) #Check that backend of contraction is GPU @test A * A ≈ cpu(cA * cA) @@ -50,7 +51,7 @@ function main() cC = ITensor( NDTensors.generic_randn(CuVector{Float64,CUDA.Mem.DeviceBuffer}, dim(dim3)), dim3 ) - cC = NDTensors.cu(ITensor(NDTensors.generic_randn(Vector{Float64}, dim(dim3)), dim3)) + cC = gpu(ITensor(NDTensors.generic_randn(Vector{Float64}, dim(dim3)), dim3)) cD = ITensor(Tensor(CuVector, dim4)) fill!(cD, randn()) @@ -62,19 +63,16 @@ function main() # Because of outer calling the _gemm! function which calls a # generic implementation grad = gradient(f, cA, cB, cC, cD) - @allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1]) + @allowscalar @test cpu(cB * cC * cD) ≈ cpu(grad[1]) @allowscalar @test (cB * cC * cD) ≈ grad[1] # Create a tuple of indices - decomp = ( - dim(NDTensors.ind(grad[1], 1)), - dim(NDTensors.ind(grad[1], 2)) * dim(NDTensors.ind(grad[1], 3)), - ) + decomp = (dim(ind(grad[1], 1)), dim(ind(grad[1], 2)) * dim(ind(grad[1], 3))) # Reshape the CuVector of data into a matrix - cuTensor_data = CUDA.reshape(NDTensors.data(storage(grad[1])), decomp) + cuTensor_data = CUDA.reshape(data(storage(grad[1])), decomp) # Use cuBLAS to compute SVD of data U, S, V = svd(cuTensor_data) - decomp = (dim(NDTensors.ind(grad[2], 1)), dim(NDTensors.ind(grad[2], 2))) - cuTensor_data = CUDA.reshape(NDTensors.data(storage(grad[2])), decomp) + decomp = (dim(ind(grad[2], 1)), dim(ind(grad[2], 2))) + cuTensor_data = CUDA.reshape(data(storage(grad[2])), decomp) U, S, V = svd(cuTensor_data) # These things can take up lots of memory, look at memory usage here @@ -87,8 +85,8 @@ function main() CUDA.memory_status() # Its possible to compute QR of GPU tensor - cq = ITensors.qr(cA, (i,), (j, l)) - q = ITensors.qr(A, (i,), (j, l)) + cq = qr(cA, (i,), (j, l)) + q = qr(A, (i,), (j, l)) A ≈ cpu(cq[1]) * cpu(cq[2]) ## SVD does not yet work with CUDA backend, see above on @@ -96,24 +94,25 @@ function main() ## CuVectors... 
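  ## (A sketch of a CPU fallback, assuming the `cpu` and `gpu` helpers defined
  ## above: `U, S, V = svd(cpu(cA), (i,), (j, l))`, then move each factor back
  ## to the device with `gpu`.)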
#ITensors.svd(A, (i,), (j, l)) - s = ITensors.siteinds("S=1/2", 8) + s = siteinds("S=1/2", 8) m = randomMPS(s; linkdims=4) - cm = NDTensors.cu(m) + cm = gpu(m) @test inner(cm', cm) ≈ inner(m', m) H = randomMPO(s) - cH = NDTensors.cu(H) + cH = gpu(H) @test inner(cm', cH, cm) ≈ inner(m', H, m) m = orthogonalize(m, 1) - cm = NDTensors.cu(orthogonalize(cm, 1)) + cm = gpu(orthogonalize(cm, 1)) @test inner(m', m) ≈ inner(cm', cm) H = orthogonalize(H, 1) - cH = NDTensors.cu(cH) + cH = gpu(cH) @test inner(cm', cH, cm) ≈ inner(m', H, m) end +## running the main function with Float64 main() From e1748be1d87fa97d7f733dad924e18f31539a074 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 28 Nov 2023 12:38:02 -0500 Subject: [PATCH 44/73] Remove LinearAlgebra. --- NDTensors/src/lib/Unwrap/test/runtests.jl | 42 +++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index 3df09811d0..5b172e1606 100644 --- a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -16,7 +16,7 @@ using LinearAlgebra: using GPUArraysCore: @allowscalar include( joinpath( - pathof(NDTensors)[1:(end - 16)], "test", "NDTensorsTestUtils", "NDTensorsTestUtils.jl" + pkgdir(NDTensors), "test", "NDTensorsTestUtils", "NDTensorsTestUtils.jl" ), ) using .NDTensorsTestUtils: NDTensorsTestUtils @@ -34,8 +34,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils v_type = typeof(v) e_type = eltype(v) @test typeof(E) == Exposed{v_type,v_type} - @test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}} - @test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}} + @test typeof(Et) == Exposed{v_type,Transpose{e_type,v_type}} + @test typeof(Ea) == Exposed{v_type,Adjoint{e_type,v_type}} @test parent(E) == v @test parent(Et) == v @@ -53,8 +53,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils m_type = typeof(m) @test typeof(E) == Exposed{m_type,m_type} - @test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}} - @test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}} + @test typeof(Et) == Exposed{m_type,Transpose{e_type,m_type}} + @test typeof(Ea) == Exposed{m_type,Adjoint{e_type,m_type}} o = dev(randn(elt, 1)) expose(o)[] = 2 @@ -69,7 +69,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils copyto!(expose(mp), expose(ma)) @test mp == ma - q, r = LinearAlgebra.qr(expose(mp)) + q, r = qr(expose(mp)) @test q * r ≈ mp q, r = Unwrap.qr_positive(expose(mp)) @@ -79,29 +79,29 @@ using .NDTensorsTestUtils: NDTensorsTestUtils square = (square + transpose(square)) / 2 ## CUDA only supports Hermitian or Symmetric eigen decompositions ## So I symmetrize square and call symetric here - l, U = LinearAlgebra.eigen(expose(LinearAlgebra.Symmetric(square))) + l, U = eigen(expose(Symmetric(square))) @test eltype(l) == real(elt) @test eltype(U) == real(elt) - @test square * U ≈ U * LinearAlgebra.Diagonal(l) + @test square * U ≈ U * Diagonal(l) square = dev(rand(elt, (10, 10))) # Can use `hermitianpart` in Julia 1.10 square = (square + square') / 2 ## CUDA only supports Hermitian or Symmetric eigen decompositions ## So I symmetrize square and call symetric here - l, U = LinearAlgebra.eigen(expose(LinearAlgebra.Hermitian(square))) + l, U = eigen(expose(Hermitian(square))) @test eltype(l) == real(elt) @test eltype(U) == elt - @test square * U ≈ U * LinearAlgebra.Diagonal(l) + @test square * U ≈ U * Diagonal(l) - U, S, V, = LinearAlgebra.svd(expose(mp)) + U, S, 
V, = svd(expose(mp)) @test eltype(U) == elt @test eltype(S) == real(elt) @test eltype(V) == elt - @test U * LinearAlgebra.Diagonal(S) * V' ≈ mp + @test U * Diagonal(S) * V' ≈ mp cm = dev(randn(elt, 2, 2)) - LinearAlgebra.mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) + mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0) @test cm ≈ mp * mp' @test permutedims(expose(mp), (2, 1)) == transpose(mp) @@ -169,10 +169,10 @@ using .NDTensorsTestUtils: NDTensorsTestUtils ## This fails with scalar indexing if dev != NDTensors.cpu - @test_broken LinearAlgebra.mul!(transpose(C), transpose(A), B, true, false) + @test_broken mul!(transpose(C), transpose(A), B, true, false) end - LinearAlgebra.mul!(C, transpose(B), A, true, false) - LinearAlgebra.mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) + mul!(C, transpose(B), A, true, false) + mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) @test C ≈ Cp Cp = zero(C) ## Try calling mul!! with transposes to verify that code works @@ -182,10 +182,10 @@ using .NDTensorsTestUtils: NDTensorsTestUtils Cp = zero(C) ## This fails with scalar indexing if dev != NDTensors.cpu - @test_broken LinearAlgebra.mul!(C', A', B, true, false) + @test_broken mul!(C', A', B, true, false) end - LinearAlgebra.mul!(C, B', A, true, false) - LinearAlgebra.mul!(expose(Cp'), expose(A'), expose(B), true, false) + mul!(C, B', A, true, false) + mul!(expose(Cp'), expose(A'), expose(B), true, false) @test C ≈ Cp Cp = zero(C) Cpt = NDTensors.mul!!(Cp', A', B, true, false) @@ -197,9 +197,9 @@ using .NDTensorsTestUtils: NDTensorsTestUtils A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) B = dev(randn(elt, 2, 2)) C = dev(zeros(elt, 2, 12)) - LinearAlgebra.mul!(expose(C), expose(B), expose(A), true, false) + mul!(expose(C), expose(B), expose(A), true, false) Cp = NDTensors.cpu(similar(C)) - LinearAlgebra.mul!( + mul!( expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false ) @test NDTensors.cpu(C) ≈ Cp From 37c0cedd41bffdbccfff5ee23ad1b7229ad163bc Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 28 Nov 2023 13:14:18 -0500 Subject: [PATCH 45/73] Changes to NDTensorsCUDA example --- NDTensors/ext/examples/NDTensorCUDA.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index 27a2f542f4..3beef2f733 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -1,10 +1,9 @@ using NDTensors -using CUDA: cu, CUDA, CuVector +using CUDA: CUDA, CuVector, @allowscalar, cu, reshape using ITensors: - Index, inner, ITensor, orthogonalize, qr, randomMPO, randomMPS, siteinds, storage, svd + Index, ITensor, randomMPO, randomMPS, inner, orthogonalize, qr, siteinds, svd using Test: @test using Zygote: gradient -using GPUArraysCore: @allowscalar function main() # using ITensorGPU @@ -68,11 +67,11 @@ function main() # Create a tuple of indices decomp = (dim(ind(grad[1], 1)), dim(ind(grad[1], 2)) * dim(ind(grad[1], 3))) # Reshape the CuVector of data into a matrix - cuTensor_data = CUDA.reshape(data(storage(grad[1])), decomp) + cuTensor_data = CUDA.reshape(array(grad[1]), decomp) # Use cuBLAS to compute SVD of data U, S, V = svd(cuTensor_data) decomp = (dim(ind(grad[2], 1)), dim(ind(grad[2], 2))) - cuTensor_data = CUDA.reshape(data(storage(grad[2])), decomp) + cuTensor_data = CUDA.reshape(array(grad[2]), decomp) U, S, V = svd(cuTensor_data) # These things can take up lots of memory, look at memory 
usage here From 663b56e77f9186ef7d0b22216a300fe75a135f93 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Tue, 28 Nov 2023 13:15:31 -0500 Subject: [PATCH 46/73] Remove NDTensors. --- NDTensors/src/blocksparse/blocksparsetensor.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index 65f7966497..435983039c 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -49,7 +49,7 @@ Construct a block sparse tensor with uninitialized memory from indices and locations of non-zero blocks. """ function BlockSparseTensor(::UndefInitializer, blockoffsets, inds) - return BlockSparseTensor(NDTensors.default_eltype(), undef, blockoffsets, inds) + return BlockSparseTensor(default_eltype(), undef, blockoffsets, inds) end function BlockSparseTensor( @@ -65,7 +65,7 @@ function BlockSparseTensor(eltype::Type{<:Number}, blockoffsets::BlockOffsets, i end function BlockSparseTensor(blockoffsets::BlockOffsets, inds) - return BlockSparseTensor(NDTensors.default_eltype(), blockoffsets, inds) + return BlockSparseTensor(default_eltype(), blockoffsets, inds) end """ @@ -73,7 +73,7 @@ end Construct a block sparse tensor with no blocks. """ -BlockSparseTensor(inds) = BlockSparseTensor(NDTensors.default_eltype(), inds) +BlockSparseTensor(inds) = BlockSparseTensor(default_eltype(), inds) function BlockSparseTensor(datatype::Type{<:AbstractArray}, inds) return BlockSparseTensor(datatype, BlockOffsets{length(inds)}(), inds) @@ -99,7 +99,7 @@ Construct a block sparse tensor with the specified blocks. Defaults to setting structurally non-zero blocks to zero. """ function BlockSparseTensor(blocks::Vector{BlockT}, inds) where {BlockT<:Union{Block,NTuple}} - return BlockSparseTensor(NDTensors.default_eltype(), blocks, inds) + return BlockSparseTensor(default_eltype(), blocks, inds) end function BlockSparseTensor( @@ -160,7 +160,7 @@ function randomBlockSparseTensor(blocks::Vector, inds) end function randomBlockSparseTensor(rng::AbstractRNG, blocks::Vector, inds) - return randomBlockSparseTensor(rng, NDTensors.default_eltype(), blocks, inds) + return randomBlockSparseTensor(rng, default_eltype(), blocks, inds) end """ From 8de246b7db6a7074b67efb624390a5bfee54014b Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 28 Nov 2023 18:53:33 -0500 Subject: [PATCH 47/73] Update NDTensors/test/runtests.jl [no-ci] Co-authored-by: Matt Fishman --- NDTensors/test/runtests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl index adbbf23382..d9a1f06608 100644 --- a/NDTensors/test/runtests.jl +++ b/NDTensors/test/runtests.jl @@ -6,8 +6,8 @@ using SafeTestsets: @safetestset filenames = filter(readdir(@__DIR__)) do f startswith("test_")(f) && endswith(".jl")(f) end - for dir in ["lib/", "arraytensor/", "ITensors/"] - push!(filenames, dir * "runtests.jl") + for dir in ["lib", "arraytensor", "ITensors"] + push!(filenames, joinpath(dir, "runtests.jl")) end @testset "Test $(@__DIR__)/$filename" for filename in filenames println("Running $(@__DIR__)/$filename") From e6a68e58ea1e7b2f7846385788ffa176267fd8d0 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 28 Nov 2023 18:58:08 -0500 Subject: [PATCH 48/73] Just bring name into namespace --- NDTensors/test/runtests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl index 
d9a1f06608..872ec7f5b1 100644 --- a/NDTensors/test/runtests.jl +++ b/NDTensors/test/runtests.jl @@ -15,11 +15,11 @@ using SafeTestsets: @safetestset end end if "cuda" in ARGS || "all" in ARGS - using NDTensors + using NDTensors: NDTensors include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorCUDA.jl")) end if "metal" in ARGS || "all" in ARGS - using NDTensors + using NDTensors: NDTensors include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorMetal.jl")) end end From edcb6c8901846acbf40d14bf976542de17c3a198 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 28 Nov 2023 18:59:24 -0500 Subject: [PATCH 49/73] Just use reshape --- NDTensors/ext/examples/NDTensorCUDA.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index 3beef2f733..cdf08326f0 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -67,11 +67,11 @@ function main() # Create a tuple of indices decomp = (dim(ind(grad[1], 1)), dim(ind(grad[1], 2)) * dim(ind(grad[1], 3))) # Reshape the CuVector of data into a matrix - cuTensor_data = CUDA.reshape(array(grad[1]), decomp) + cuTensor_data = reshape(array(grad[1]), decomp) # Use cuBLAS to compute SVD of data U, S, V = svd(cuTensor_data) decomp = (dim(ind(grad[2], 1)), dim(ind(grad[2], 2))) - cuTensor_data = CUDA.reshape(array(grad[2]), decomp) + cuTensor_data = reshape(array(grad[2]), decomp) U, S, V = svd(cuTensor_data) # These things can take up lots of memory, look at memory usage here From 246ecd49fc0c9f689503b5cb12cd91661b352dd2 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 12:07:31 -0500 Subject: [PATCH 50/73] Updates to NDTensorsMetal example --- NDTensors/ext/examples/NDTensorMetal.jl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/NDTensors/ext/examples/NDTensorMetal.jl b/NDTensors/ext/examples/NDTensorMetal.jl index 6f670b3083..850e151ef7 100644 --- a/NDTensors/ext/examples/NDTensorMetal.jl +++ b/NDTensors/ext/examples/NDTensorMetal.jl @@ -1,9 +1,9 @@ -using Metal +using Metal: MtlVector, mtl, @allowscalar using NDTensors -using ITensors -using Test -using Zygote +using ITensors: ITensor, Index, randomITensor +using Test: @test +using Zygote: gradient function main() # Here is an example of how to utilize NDTensors based tensors with CUDA datatypes @@ -25,8 +25,6 @@ function main() @test A * B ≈ cpu(cC) - #C = A * B - dim3 = (l, k) dim4 = (i,) @@ -35,7 +33,8 @@ function main() f(A, B, C, D) = (A * B * C * D)[] - return grad = gradient(f, cA, cB, cC, cD) + grad = gradient(f, cA, cB, cC, cD) + @test grad[2] ≈ cA * cC * cD end main() From ab237aa070b96a2938505a42ffc305ff16a6cb13 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 12:08:16 -0500 Subject: [PATCH 51/73] Remove unwrap_type --- NDTensors/ext/NDTensorsMetalExt/permutedims.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl index f507ede599..9ed4dabb86 100644 --- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl +++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl @@ -20,7 +20,7 @@ function Base.permutedims!( perm, f, ) - Aperm = unwrap_type(Esrc)(reshape(permutedims(Esrc, perm), size(parent(Edest)))) + Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest))) parent(Edest) .= f.(parent(Edest), Aperm) return unexpose(Edest) end From 0abeb5abee9d77a61bbd77610a087ef1349a8164 
Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 12:08:29 -0500 Subject: [PATCH 52/73] format --- NDTensors/src/lib/Unwrap/test/runtests.jl | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index 5b172e1606..ed3113fdbf 100644 --- a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -14,11 +14,7 @@ using LinearAlgebra: qr, svd using GPUArraysCore: @allowscalar -include( - joinpath( - pkgdir(NDTensors), "test", "NDTensorsTestUtils", "NDTensorsTestUtils.jl" - ), -) +include(joinpath(pkgdir(NDTensors), "test", "NDTensorsTestUtils", "NDTensorsTestUtils.jl")) using .NDTensorsTestUtils: NDTensorsTestUtils @testset "Testing Unwrap $dev, $elt" for dev in NDTensorsTestUtils.devices_list(ARGS), @@ -199,11 +195,29 @@ using .NDTensorsTestUtils: NDTensorsTestUtils C = dev(zeros(elt, 2, 12)) mul!(expose(C), expose(B), expose(A), true, false) Cp = NDTensors.cpu(similar(C)) - mul!( - expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false - ) + mul!(expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false) @test NDTensors.cpu(C) ≈ Cp NDTensors.zero(C) NDTensors.mul!!(C, B, A, true, false) @test NDTensors.cpu(C) ≈ Cp + + y = reshape(dev(randn(elt, 8))', 2, 4) + x = Base.ReshapedArray(dev(randn(elt, 8, 8)'[1:8]), (2, 4), ()) + z = dev(fill!(Matrix{elt}(undef, (2, 4)), 0.0)) + for i in 1:2 + for j in 1:4 + @allowscalar z[i, j] = y[i, j] * x[i, j] + end + end + permutedims!(expose(y), expose(x), (1, 2), *) + @allowscalar @test reshape(z, size(y)) ≈ y + for i in 1:2 + for j in 1:4 + @allowscalar z[i, j] = x[i, j] * y[i, j] + end + end + permutedims!(expose(x), expose(y), (1, 2), *) + ## I copy x here because it is a ReshapedArray{SubArray} which causes `≈` + ## to throw an error + @test z ≈ copy(expose(x)) end From 4700b39a0565f0f88541011b971ab895c71bc7d0 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 12:53:51 -0500 Subject: [PATCH 53/73] calling permutedims(expose(A) ...) can cause a stack overflow. --- .../ext/NDTensorsMetalExt/permutedims.jl | 12 ++++- NDTensors/src/lib/Unwrap/test/runtests.jl | 53 ++++++++++++------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl index 9ed4dabb86..a960475d28 100644 --- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl +++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl @@ -1,7 +1,8 @@ function Base.permutedims(E::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm) A = copy(E) - return permutedims(expose(A), perm) + return permutedims(A, perm) end + ## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted function Base.permutedims!( Edest::Exposed{<:MtlArray,<:Base.ReshapedArray}, Esrc::Exposed{<:MtlArray}, perm @@ -11,6 +12,15 @@ function Base.permutedims!( return unexpose(Edest) end +## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted +function Base.permutedims!( + Edest::Exposed{<:MtlArray}, Esrc::Exposed{<:MtlArray, <:Base.ReshapedArray}, perm +) + Aperm = permutedims(Esrc, perm) + copyto!(Edest, expose(Aperm)) + return unexpose(Edest) +end + ## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted ## To get around this copy and permute Esrc, reshape to the size of Edest's parent ## and broadcast into the parent. 
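An aside on the diff above: the key change is at the top of the hunk, permuting the plain copy `A` instead of re-wrapping it as `expose(A)`. Below is a minimal sketch of the dispatch loop this avoids, using hypothetical stand-in types (`MyExposed`, `myexpose`, `myunexpose`) rather than the actual NDTensors Unwrap definitions:

    struct MyExposed{A<:AbstractArray}
      parent::A
    end
    myexpose(a::AbstractArray) = MyExposed(a)
    myunexpose(E::MyExposed) = E.parent

    ## If the copy is re-wrapped before permuting, dispatch can land back on
    ## this same method and recurse until the stack overflows (the failure
    ## named in this commit message):
    # Base.permutedims(E::MyExposed, perm) = permutedims(myexpose(copy(myunexpose(E))), perm)

    ## Permuting the plain copy terminates, mirroring `return permutedims(A, perm)` above:
    Base.permutedims(E::MyExposed, perm) = permutedims(copy(myunexpose(E)), perm)

    x = reshape(collect(1:6), 2, 3)
    @assert permutedims(myexpose(x), (2, 1)) == permutedims(x, (2, 1))

The same pattern runs through the `permutedims!` methods in this extension: copy or permute on an unwrapped array first, then `copyto!` or broadcast into the destination, presumably so that dispatch never re-enters the wrapped-array methods.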
diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index ed3113fdbf..cd07ae52b1 100644 --- a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -129,6 +129,40 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test NDTensors.cpu(y) == NDTensors.cpu(x) @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x) + ## Tests for Metal because permutedims with ReshapedArray does not work properly + ## transpose(ReshapedArray(MtlArray)) fails with scalar indexing so calling copy to + ## evaluate tests in the following tests + y = dev(rand(elt, 4, 4)) + @test permutedims(expose(y), (2,1)) == transpose(y) + y = Base.ReshapedArray(y, (2,8), ()) + @test permutedims(expose(y), (2,1)) == transpose(copy(expose(y))) + yt = dev(rand(elt, (8,2))) + permutedims!(expose(y), expose(yt), (2,1)) + @test copy(expose(y)) == transpose(yt) + yt = dev(rand(elt, 8,2)) + permutedims!(expose(yt), expose(y), (2,1)) + @test copy(expose(y)) == transpose(yt) + + y = reshape(dev(randn(elt, 8))', 2, 4) + x = Base.ReshapedArray(dev(randn(elt, 8, 8)'[1:8]), (2, 4), ()) + z = dev(fill!(Matrix{elt}(undef, (2, 4)), 0.0)) + for i in 1:2 + for j in 1:4 + @allowscalar z[i, j] = y[i, j] * x[i, j] + end + end + permutedims!(expose(y), expose(x), (1, 2), *) + @allowscalar @test reshape(z, size(y)) ≈ y + for i in 1:2 + for j in 1:4 + @allowscalar z[i, j] = x[i, j] * y[i, j] + end + end + permutedims!(expose(x), expose(y), (1, 2), *) + ## I copy x here because it is a ReshapedArray{SubArray} which causes `≈` + ## to throw an error + @test z ≈ copy(expose(x)) + y = dev(rand(elt, 4, 4)) x = @view dev(rand(elt, 8, 8))[1:4, 1:4] copyto!(expose(y), expose(x)) @@ -201,23 +235,4 @@ using .NDTensorsTestUtils: NDTensorsTestUtils NDTensors.mul!!(C, B, A, true, false) @test NDTensors.cpu(C) ≈ Cp - y = reshape(dev(randn(elt, 8))', 2, 4) - x = Base.ReshapedArray(dev(randn(elt, 8, 8)'[1:8]), (2, 4), ()) - z = dev(fill!(Matrix{elt}(undef, (2, 4)), 0.0)) - for i in 1:2 - for j in 1:4 - @allowscalar z[i, j] = y[i, j] * x[i, j] - end - end - permutedims!(expose(y), expose(x), (1, 2), *) - @allowscalar @test reshape(z, size(y)) ≈ y - for i in 1:2 - for j in 1:4 - @allowscalar z[i, j] = x[i, j] * y[i, j] - end - end - permutedims!(expose(x), expose(y), (1, 2), *) - ## I copy x here because it is a ReshapedArray{SubArray} which causes `≈` - ## to throw an error - @test z ≈ copy(expose(x)) end From ac4f9c10497fd9a6eda7d01c52387d979d56baf7 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 12:54:06 -0500 Subject: [PATCH 54/73] format --- NDTensors/ext/NDTensorsMetalExt/permutedims.jl | 2 +- NDTensors/src/lib/Unwrap/test/runtests.jl | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl index a960475d28..d92190891e 100644 --- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl +++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl @@ -14,7 +14,7 @@ end ## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted function Base.permutedims!( - Edest::Exposed{<:MtlArray}, Esrc::Exposed{<:MtlArray, <:Base.ReshapedArray}, perm + Edest::Exposed{<:MtlArray}, Esrc::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm ) Aperm = permutedims(Esrc, perm) copyto!(Edest, expose(Aperm)) diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index cd07ae52b1..6747736e75 100644 --- 
a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -133,14 +133,14 @@ using .NDTensorsTestUtils: NDTensorsTestUtils ## transpose(ReshapedArray(MtlArray)) fails with scalar indexing so calling copy to ## evaluate tests in the following tests y = dev(rand(elt, 4, 4)) - @test permutedims(expose(y), (2,1)) == transpose(y) - y = Base.ReshapedArray(y, (2,8), ()) - @test permutedims(expose(y), (2,1)) == transpose(copy(expose(y))) - yt = dev(rand(elt, (8,2))) - permutedims!(expose(y), expose(yt), (2,1)) + @test permutedims(expose(y), (2, 1)) == transpose(y) + y = Base.ReshapedArray(y, (2, 8), ()) + @test permutedims(expose(y), (2, 1)) == transpose(copy(expose(y))) + yt = dev(rand(elt, (8, 2))) + permutedims!(expose(y), expose(yt), (2, 1)) @test copy(expose(y)) == transpose(yt) - yt = dev(rand(elt, 8,2)) - permutedims!(expose(yt), expose(y), (2,1)) + yt = dev(rand(elt, 8, 2)) + permutedims!(expose(yt), expose(y), (2, 1)) @test copy(expose(y)) == transpose(yt) y = reshape(dev(randn(elt, 8))', 2, 4) @@ -162,7 +162,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils ## I copy x here because it is a ReshapedArray{SubArray} which causes `≈` ## to throw an error @test z ≈ copy(expose(x)) - + y = dev(rand(elt, 4, 4)) x = @view dev(rand(elt, 8, 8))[1:4, 1:4] copyto!(expose(y), expose(x)) @@ -234,5 +234,4 @@ using .NDTensorsTestUtils: NDTensorsTestUtils NDTensors.zero(C) NDTensors.mul!!(C, B, A, true, false) @test NDTensors.cpu(C) ≈ Cp - end From 771403befb323e2b9519c59bca1d3c0bf4fed66c Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 13:01:26 -0500 Subject: [PATCH 55/73] Update mul.jl [no-ci] --- NDTensors/ext/NDTensorsMetalExt/mul.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl index ad63d92656..76aff904b3 100644 --- a/NDTensors/ext/NDTensorsMetalExt/mul.jl +++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl @@ -21,7 +21,7 @@ function LinearAlgebra.mul!( end ## Fix issue in Metal.jl where it cannot distinguish Transpose{Reshape{Adjoint{CuArray}}} -## as a CuArray and calls generic matmul +## as a MtlArray and calls generic matmul function LinearAlgebra.mul!( CM::Exposed{<:MtlArray}, AM::Exposed{<:MtlArray}, From c2b39a1dcdd21a827245e137ec19e19802c65a63 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 29 Nov 2023 13:02:02 -0500 Subject: [PATCH 56/73] Update mul.jl [no-ci] --- NDTensors/ext/NDTensorsMetalExt/mul.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl index 76aff904b3..2abf8e8cbf 100644 --- a/NDTensors/ext/NDTensorsMetalExt/mul.jl +++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl @@ -20,7 +20,7 @@ function LinearAlgebra.mul!( return unexpose(CM) end -## Fix issue in Metal.jl where it cannot distinguish Transpose{Reshape{Adjoint{CuArray}}} +## Fix issue in Metal.jl where it cannot distinguish Transpose{Reshape{Adjoint{MtlArray}}} ## as a MtlArray and calls generic matmul function LinearAlgebra.mul!( CM::Exposed{<:MtlArray}, From 919b0415c64f9e507663907e09e1ff553ba4ea86 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Wed, 29 Nov 2023 13:05:30 -0500 Subject: [PATCH 57/73] Update documentation --- NDTensors/ext/NDTensorsMetalExt/permutedims.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl index d92190891e..40cc3f588c 100644 --- 
a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
+++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl
@@ -1,9 +1,11 @@
+## There's an issue in Metal where `ReshapedArray`-wrapped arrays cannot be permuted using
+## permutedims (it fails because Metal falls back to scalar indexing)
+## These functions address the problem in the different instances of permutedims
 function Base.permutedims(E::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm)
   A = copy(E)
   return permutedims(A, perm)
 end
 
-## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted
 function Base.permutedims!(
   Edest::Exposed{<:MtlArray,<:Base.ReshapedArray}, Esrc::Exposed{<:MtlArray}, perm
 )
@@ -12,7 +14,6 @@ function Base.permutedims!(
   return unexpose(Edest)
 end
 
-## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted
 function Base.permutedims!(
   Edest::Exposed{<:MtlArray}, Esrc::Exposed{<:MtlArray,<:Base.ReshapedArray}, perm
 )
@@ -21,8 +22,8 @@ end
   return unexpose(Edest)
 end
 
-## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted
-## To get around this copy and permute Esrc, reshape to the size of Edest's parent
+## To get around the Metal issue here we copy and permute Esrc,
+## then we reshape Esrc to the size of Edest's parent
 ## and broadcast into the parent.
 function Base.permutedims!(
   Edest::Exposed{<:MtlArray,<:Base.ReshapedArray},

From d4621f5922cabc04288117a7c41a1773f74197e5 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Wed, 29 Nov 2023 16:07:28 -0500
Subject: [PATCH 58/73] Add permutedims! for failing CuArray case

---
 NDTensors/ext/NDTensorsCUDAExt/permutedims.jl | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl b/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl
index 2dba8286bb..e51421fe4d 100644
--- a/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl
+++ b/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl
@@ -5,3 +5,16 @@ function Base.permutedims!(
   copyto!(expose(parent(Edest)), expose(Aperm))
   return unexpose(Edest)
 end
+
+## Found an issue in CUDA where if Edest is a reshaped{<:Adjoint}
+## .= can fail.
So instead force Esrc into the shape of parent(Edest) function Base.permutedims!( - Edest::Exposed{<:CuArray,<:Base.ReshapedArray{<:Any, <:Any, <:Adjoint}}, + Edest::Exposed{<:CuArray,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}, Esrc::Exposed{<:CuArray}, perm, f, From cda274f35ca76b5a5470a34dff665ef59df99b9f Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Thu, 30 Nov 2023 11:17:49 -0500 Subject: [PATCH 60/73] simplifications to NDTensorsCUDA example --- NDTensors/ext/examples/NDTensorCUDA.jl | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index cdf08326f0..4614c82a5e 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -7,6 +7,8 @@ using Zygote: gradient function main() # using ITensorGPU + cpu = NDTensors.cpu + gpu = NDTensors.cu # Here is an example of how to utilize NDTensors based tensors with CUDA datatypes i = Index(2) j = Index(5) @@ -17,11 +19,9 @@ function main() dim2 = (j, k) # Create 2 ITensors with CUDA backends (These will be made simpiler by randomITensor(CuVector) soon) - A = ITensor(NDTensors.generic_randn(CuVector, dim(dim1)), dim1) - B = ITensor(NDTensors.generic_randn(CuVector, dim(dim2)), dim2) + A = ITensor(randomTensor(CuVector, dim1)) + B = ITensor(randomTensor(CuVector, dim2)) # Contract the two tensors - cpu = NDTensors.cpu - gpu = NDTensors.cu C = A * B A = cpu(A) B = cpu(B) @@ -47,11 +47,8 @@ function main() dim3 = (l, k) dim4 = (i,) - cC = ITensor( - NDTensors.generic_randn(CuVector{Float64,CUDA.Mem.DeviceBuffer}, dim(dim3)), dim3 - ) - cC = gpu(ITensor(NDTensors.generic_randn(Vector{Float64}, dim(dim3)), dim3)) - cD = ITensor(Tensor(CuVector, dim4)) + cC = ITensor( randomTensor(CuVector{Float64, CUDA.Mem.DeviceBuffer}, dim3)) + cD = ITensor(Tensor(CuVector{Float32}, dim4)) fill!(cD, randn()) # Create a function of 4 tensors on GPU @@ -62,7 +59,7 @@ function main() # Because of outer calling the _gemm! 
function which calls a
  # generic implementation
  grad = gradient(f, cA, cB, cC, cD)
-  @allowscalar @test cpu(cB * cC * cD) ≈ cpu(grad[1])
+  @test cpu(cB * cC * cD) ≈ cpu(grad[1])
   @allowscalar @test (cB * cC * cD) ≈ grad[1]
   # Create a tuple of indices
   decomp = (dim(ind(grad[1], 1)), dim(ind(grad[1], 2)) * dim(ind(grad[1], 3)))
   # Reshape the CuVector of data into a matrix
   cuTensor_data = reshape(array(grad[1]), decomp)
   # Use cuBLAS to compute SVD of data
   U, S, V = svd(cuTensor_data)
-  decomp = (dim(ind(grad[2], 1)), dim(ind(grad[2], 2)))
+  decomp = size(array(grad[2]))
   cuTensor_data = reshape(array(grad[2]), decomp)
   U, S, V = svd(cuTensor_data)
@@ -85,7 +82,6 @@ function main()
   # It's possible to compute QR of GPU tensor
   cq = qr(cA, (i,), (j, l))
-  q = qr(A, (i,), (j, l))
   A ≈ cpu(cq[1]) * cpu(cq[2])

   ## SVD does not yet work with CUDA backend, see above on

From 5f29bf410bc7f4dfb9c8bf1f80aacb74ddf5bafc Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 30 Nov 2023 11:19:12 -0500
Subject: [PATCH 61/73] Allowscalar shouldn't be necessary

---
 NDTensors/ext/examples/NDTensorCUDA.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl
index 4614c82a5e..9192efbce3 100644
--- a/NDTensors/ext/examples/NDTensorCUDA.jl
+++ b/NDTensors/ext/examples/NDTensorCUDA.jl
@@ -1,5 +1,5 @@
 using NDTensors
-using CUDA: CUDA, CuVector, @allowscalar, cu, reshape
+using CUDA: CUDA, CuVector, cu, reshape
 using ITensors: Index, ITensor, randomMPO, randomMPS, inner, orthogonalize, qr, siteinds, svd
 using Test: @test
@@ -60,7 +60,7 @@ function main()
   # generic implementation
   grad = gradient(f, cA, cB, cC, cD)
   @test cpu(cB * cC * cD) ≈ cpu(grad[1])
-  @allowscalar @test (cB * cC * cD) ≈ grad[1]
+  @test (cB * cC * cD) ≈ grad[1]
   # Create a tuple of indices
   decomp = (dim(ind(grad[1], 1)), dim(ind(grad[1], 2)) * dim(ind(grad[1], 3)))

From 7ce440bf25565d5c585bc37a15783662193d7193 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 30 Nov 2023 11:21:14 -0500
Subject: [PATCH 62/73] Simplifications to NDTensorsMetal tests

---
 NDTensors/ext/examples/NDTensorMetal.jl | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/NDTensors/ext/examples/NDTensorMetal.jl b/NDTensors/ext/examples/NDTensorMetal.jl
index 850e151ef7..95f0445f6c 100644
--- a/NDTensors/ext/examples/NDTensorMetal.jl
+++ b/NDTensors/ext/examples/NDTensorMetal.jl
@@ -1,4 +1,4 @@
-using Metal: MtlVector, mtl, @allowscalar
+using Metal: MtlVector, mtl
 using NDTensors
@@ -6,6 +6,8 @@ using Test: @test
 using Zygote: gradient
 function main()
+  cpu = NDTensors.cpu
+  gpu = NDTensors.mtl
   # Here is an example of how to utilize NDTensors based tensors with CUDA datatypes
   i = Index(20)
   j = Index(5)
@@ -15,11 +17,11 @@ function main()
   dim1 = (i, j, l)
   dim2 = (j, k)
-  cA = ITensor(NDTensors.generic_randn(MtlVector{Float32}, dim(dim1)), dim1)
-  cB = ITensor(NDTensors.generic_randn(MtlVector{Float32}, dim(dim2)), dim2)
+  ## MtlArrays only support Float32 arithmetic
+  cA = ITensor(randomTensor(MtlVector{Float32}, dim1))
+  cB = ITensor(randomTensor(MtlVector{Float32}, dim2))
   cC = cA * cB
-  cpu = NDTensors.cpu
   A = cpu(cA)
   B = cpu(cB)
@@ -28,8 +30,8 @@ function main()
   dim3 = (l, k)
   dim4 = (i,)
-  cC = mtl(randomITensor(Float32, dim3))
-  cD = mtl(randomITensor(Float32, dim4))
+  cC = gpu(randomITensor(Float32, dim3))
+  cD = gpu(randomITensor(Float32, dim4))
   f(A, B, C, D) = (A * B * C * D)[]

From
6dd95d4e1ca18e91aab4753d85d526af6476e3f2 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Thu, 30 Nov 2023 11:22:20 -0500 Subject: [PATCH 63/73] spelling --- NDTensors/src/dense/tensoralgebra/outer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NDTensors/src/dense/tensoralgebra/outer.jl b/NDTensors/src/dense/tensoralgebra/outer.jl index d66d87a2a7..1e78fda1fb 100644 --- a/NDTensors/src/dense/tensoralgebra/outer.jl +++ b/NDTensors/src/dense/tensoralgebra/outer.jl @@ -22,7 +22,7 @@ function outer!( v2 = data(T2) RM = reshape(R, length(v1), length(v2)) ## There is no _gemm! defined for CUDA or Metal so it calls - ## generic matmul. Replace with mul!! to call correct mul!! (ger) + ## generic matmul. Replace with mul!! to call correct mul! (ger) mul!!(array(RM), v1, transpose(v2), one(ElR), zero(ElR)) return R end From 24989306a85c2ae3acc7973dd5703b49bb1af367 Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Thu, 30 Nov 2023 12:15:05 -0500 Subject: [PATCH 64/73] formatting --- NDTensors/ext/examples/NDTensorCUDA.jl | 2 +- NDTensors/src/lib/Unwrap/test/runtests.jl | 30 +++++++++---------- .../test/ITensors/TestITensorDMRG/dmrg.jl | 4 +-- NDTensors/test/ITensors/runtests.jl | 9 ++---- NDTensors/test/test_blocksparse.jl | 29 ++++++++---------- NDTensors/test/test_combiner.jl | 8 ++--- NDTensors/test/test_dense.jl | 4 +-- NDTensors/test/test_diag.jl | 6 ++-- NDTensors/test/test_emptystorage.jl | 4 +-- NDTensors/test/test_linearalgebra.jl | 16 +++++----- 10 files changed, 51 insertions(+), 61 deletions(-) diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index 9192efbce3..1c169d29c9 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -47,7 +47,7 @@ function main() dim3 = (l, k) dim4 = (i,) - cC = ITensor( randomTensor(CuVector{Float64, CUDA.Mem.DeviceBuffer}, dim3)) + cC = ITensor(randomTensor(CuVector{Float64,CUDA.Mem.DeviceBuffer}, dim3)) cD = ITensor(Tensor(CuVector{Float32}, dim4)) fill!(cD, randn()) diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index 6747736e75..055a9002da 100644 --- a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -15,9 +15,9 @@ using LinearAlgebra: svd using GPUArraysCore: @allowscalar include(joinpath(pkgdir(NDTensors), "test", "NDTensorsTestUtils", "NDTensorsTestUtils.jl")) -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: devices_list -@testset "Testing Unwrap $dev, $elt" for dev in NDTensorsTestUtils.devices_list(ARGS), +@testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS), elt in (Float32, ComplexF32) v = dev(randn(elt, 10)) @@ -126,8 +126,8 @@ using .NDTensorsTestUtils: NDTensorsTestUtils y = dev(rand(elt, 4, 4)) x = Base.ReshapedArray(dev(rand(elt, 16)), (4, 4), ()) copyto!(expose(y), expose(x)) - @test NDTensors.cpu(y) == NDTensors.cpu(x) - @test NDTensors.cpu(copy(expose(x))) == NDTensors.cpu(x) + @test cpu(y) == cpu(x) + @test cpu(copy(expose(x))) == cpu(x) ## Tests for Metal because permutedims with ReshapedArray does not work properly ## transpose(ReshapedArray(MtlArray)) fails with scalar indexing so calling copy to @@ -188,7 +188,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) x = dev(randn(elt, 4, 4)) permutedims!(expose(y), expose(x), (2, 1)) - @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) + @test cpu(y) == transpose(cpu(x)) 
########################################## ### Testing an issue with CUDA&Metal transpose/adjoint mul @@ -198,7 +198,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils Cp = copy(C) ## This fails with scalar indexing - if dev != NDTensors.cpu + if dev != cpu @test_broken mul!(transpose(C), transpose(A), B, true, false) end mul!(C, transpose(B), A, true, false) @@ -206,19 +206,19 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test C ≈ Cp Cp = zero(C) ## Try calling mul!! with transposes to verify that code works - Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) + Cpt = mul!!(transpose(Cp), transpose(A), B, true, false) @test transpose(Cpt) ≈ C Cp = zero(C) ## This fails with scalar indexing - if dev != NDTensors.cpu + if dev != cpu @test_broken mul!(C', A', B, true, false) end mul!(C, B', A, true, false) mul!(expose(Cp'), expose(A'), expose(B), true, false) @test C ≈ Cp Cp = zero(C) - Cpt = NDTensors.mul!!(Cp', A', B, true, false) + Cpt = mul!!(Cp', A', B, true, false) @test Cpt' ≈ C ################################## @@ -228,10 +228,10 @@ using .NDTensorsTestUtils: NDTensorsTestUtils B = dev(randn(elt, 2, 2)) C = dev(zeros(elt, 2, 12)) mul!(expose(C), expose(B), expose(A), true, false) - Cp = NDTensors.cpu(similar(C)) - mul!(expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false) - @test NDTensors.cpu(C) ≈ Cp - NDTensors.zero(C) - NDTensors.mul!!(C, B, A, true, false) - @test NDTensors.cpu(C) ≈ Cp + Cp = cpu(similar(C)) + mul!(expose(Cp), expose(cpu(B)), expose(cpu(A)), true, false) + @test cpu(C) ≈ Cp + zero(C) + mul!!(C, B, A, true, false) + @test cpu(C) ≈ Cp end diff --git a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl index efbd97d4ef..d997c088db 100644 --- a/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl +++ b/NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl @@ -2,6 +2,7 @@ using ITensors: MPO, OpSum, dmrg, randomMPS, siteinds using Random: Random using Test: @test include("../../NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: default_rtol # TODO: Include file with `reference_energies`. 
function test_dmrg( @@ -30,6 +31,5 @@ function test_dmrg( energy, psi = dmrg(H, psi0; nsweeps, cutoff, maxdim, noise, outputlevel) - @test energy ≈ reference_energies[N] rtol = - rtol_scale * NDTensorsTestUtils.default_rtol(elt) + @test energy ≈ reference_energies[N] rtol = rtol_scale * default_rtol(elt) end diff --git a/NDTensors/test/ITensors/runtests.jl b/NDTensors/test/ITensors/runtests.jl index 3b375232e0..373906bb70 100644 --- a/NDTensors/test/ITensors/runtests.jl +++ b/NDTensors/test/ITensors/runtests.jl @@ -4,16 +4,13 @@ using SafeTestsets: @safetestset using Test: @testset include("TestITensorDMRG/TestITensorDMRG.jl") include("../NDTensorsTestUtils/NDTensorsTestUtils.jl") - using .NDTensorsTestUtils: NDTensorsTestUtils - @testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in - NDTensorsTestUtils.devices_list( - ARGS - ), + using .NDTensorsTestUtils: devices_list, is_supported_eltype + @testset "Test DMRG $dev, $conserve_qns, $elt, $N" for dev in devices_list(ARGS), conserve_qns in [false, true], elt in (Float32, ComplexF32, Float64, ComplexF64), N in [4, 10] - if !NDTensorsTestUtils.is_supported_eltype(dev, elt) + if !is_supported_eltype(dev, elt) continue end if TestITensorDMRG.is_broken(dev, elt, Val(conserve_qns)) diff --git a/NDTensors/test/test_blocksparse.jl b/NDTensors/test/test_blocksparse.jl index 4a8e2f6aee..eefaf51734 100644 --- a/NDTensors/test/test_blocksparse.jl +++ b/NDTensors/test/test_blocksparse.jl @@ -1,20 +1,18 @@ @eval module $(gensym()) using NDTensors -using LinearAlgebra: exp, Hermitian, svd +using LinearAlgebra: Hermitian, exp, svd using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: default_rtol, devices_list, is_supported_eltype @testset "BlockSparseTensor basic functionality" begin C = nothing - @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list( - copy(ARGS) - ), + @testset "test device: $dev, eltype: $elt" for dev in devices_list(copy(ARGS)), elt in (Float32, Float64) - if !NDTensorsTestUtils.is_supported_eltype(dev, elt) + if !is_supported_eltype(dev, elt) continue end # Indices @@ -233,8 +231,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils @test isblocknz(T, (2, 2)) end - @testset "svd on $dev, eltype: $elt" for dev in - NDTensorsTestUtils.devices_list(copy(ARGS)), + @testset "svd on $dev, eltype: $elt" for dev in devices_list(copy(ARGS)), elt in (Float32, Float64) if dev == NDTensors.mtl && elt == Float64 @@ -245,7 +242,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils randn!(A) U, S, V = svd(A) @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) + atol = default_rtol(elt) end @testset "svd example 2" begin @@ -253,7 +250,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils randn!(A) U, S, V = svd(A) @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) + atol = default_rtol(elt) end @testset "svd example 3" begin @@ -261,7 +258,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils randn!(A) U, S, V = svd(A) @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) + atol = default_rtol(elt) end @testset "svd example 4" begin @@ -269,7 +266,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils randn!(A) U, S, V = svd(A) @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = 
NDTensorsTestUtils.default_rtol(elt) + atol = default_rtol(elt) end @testset "svd example 5" begin @@ -277,7 +274,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils randn!(A) U, S, V = svd(A) @test @allowscalar array(U) * array(S) * array(V)' ≈ array(A) - atol = NDTensorsTestUtils.default_rtol(elt) + atol = default_rtol(elt) end end @@ -286,7 +283,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils randn!(A) expT = exp(A) @test array(expT) ≈ exp(array(A)) - atol = NDTensorsTestUtils.default_rtol(elt) + atol = default_rtol(elt) # Hermitian case A = BlockSparseTensor(complex(elt), [(1, 1), (2, 2)], ([2, 2], [2, 2])) @@ -297,9 +294,7 @@ using .NDTensorsTestUtils: NDTensorsTestUtils blockview(Ah, bA) .= b + b' end expTh = exp(Hermitian(Ah)) - @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = NDTensorsTestUtils.default_rtol( - eltype(Ah) - ) + @test array(expTh) ≈ exp(Hermitian(array(Ah))) rtol = default_rtol(eltype(Ah)) A = BlockSparseTensor{elt}([(2, 1), (1, 2)], [2, 2], [2, 2]) @test_throws ErrorException exp(A) diff --git a/NDTensors/test/test_combiner.jl b/NDTensors/test/test_combiner.jl index 001e86a1aa..989b0701c9 100644 --- a/NDTensors/test/test_combiner.jl +++ b/NDTensors/test/test_combiner.jl @@ -3,18 +3,16 @@ using NDTensors using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: devices_list, is_supported_eltype # Testing generic block indices using ITensors: QN, Index @testset "CombinerTensor basic functionality" begin - @testset "test device: $dev, eltype: $elt" for dev in NDTensorsTestUtils.devices_list( - copy(ARGS) - ), + @testset "test device: $dev, eltype: $elt" for dev in devices_list(copy(ARGS)), elt in (Float64, Float32) - if !NDTensorsTestUtils.is_supported_eltype(dev, elt) + if !is_supported_eltype(dev, elt) continue end @testset "Dense * Combiner" begin diff --git a/NDTensors/test/test_dense.jl b/NDTensors/test/test_dense.jl index d0872b0116..2180877e4a 100644 --- a/NDTensors/test/test_dense.jl +++ b/NDTensors/test/test_dense.jl @@ -3,10 +3,10 @@ using NDTensors using Test: @testset, @test, @test_throws, @test_broken using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: devices_list @testset "Dense Tensors" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + @testset "test device: $dev" for dev in devices_list(copy(ARGS)) elt = dev == NDTensors.mtl ? 
Float32 : Float64 # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin diff --git a/NDTensors/test/test_diag.jl b/NDTensors/test/test_diag.jl index ca36531886..835ebe66ca 100644 --- a/NDTensors/test/test_diag.jl +++ b/NDTensors/test/test_diag.jl @@ -3,13 +3,13 @@ using NDTensors using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: devices_list, is_supported_eltype @testset "DiagTensor basic functionality" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)), + @testset "test device: $dev" for dev in devices_list(copy(ARGS)), elt in (Float32, ComplexF32, Float64, ComplexF64) - if dev == NDTensors.mtl && real(elt) ≠ Float32 + if !is_supported_eltype(dev, elt) # Metal doesn't support double precision continue end diff --git a/NDTensors/test/test_emptystorage.jl b/NDTensors/test/test_emptystorage.jl index f4db78d409..1f82ae2a57 100644 --- a/NDTensors/test/test_emptystorage.jl +++ b/NDTensors/test/test_emptystorage.jl @@ -2,10 +2,10 @@ using NDTensors using Test: @testset, @test include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: devices_list @testset "EmptyStorage test" begin - @testset "test device: $dev" for dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + @testset "test device: $dev" for dev in devices_list(copy(ARGS)) T = dev(Tensor(EmptyStorage(NDTensors.EmptyNumber), (2, 2))) @test size(T) == (2, 2) @test eltype(T) == NDTensors.EmptyNumber diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl index e9e3332eb7..90cced04e9 100644 --- a/NDTensors/test/test_linearalgebra.jl +++ b/NDTensors/test/test_linearalgebra.jl @@ -4,7 +4,7 @@ using LinearAlgebra: Diagonal, qr, diag using Test: @testset, @test using GPUArraysCore: @allowscalar include("NDTensorsTestUtils/NDTensorsTestUtils.jl") -using .NDTensorsTestUtils: NDTensorsTestUtils +using .NDTensorsTestUtils: devices_list, is_supported_eltype @testset "random_orthog" begin n, m = 10, 4 @@ -32,10 +32,10 @@ end elt in (Float64, ComplexF64, Float32, ComplexF32), positive in [false, true], singular in [false, true], - dev in NDTensorsTestUtils.devices_list(copy(ARGS)) + dev in devices_list(copy(ARGS)) ## Skip Float64 on Metal - if !NDTensorsTestUtils.is_supported_eltype(dev, elt) + if !is_supported_eltype(dev, elt) continue end eps = Base.eps(real(elt)) * 100 #this is set rather tight, so if you increase/change m,n you may have open up the tolerance on eps. @@ -53,9 +53,9 @@ end end Q, X = qx(A; positive=positive) #X is R or L. Ap = Q * X - @test NDTensors.cpu(A) ≈ NDTensors.cpu(Ap) atol = eps - @test NDTensors.cpu(array(Q)' * array(Q)) ≈ Id atol = eps - @test NDTensors.cpu(array(Q) * array(Q)') ≈ Id atol = eps + @test cpu(A) ≈ cpu(Ap) atol = eps + @test cpu(array(Q)' * array(Q)) ≈ Id atol = eps + @test cpu(array(Q) * array(Q)') ≈ Id atol = eps @allowscalar if positive nr, nc = size(X) dr = qx == ql ? Base.max(0, nc - nr) : 0 @@ -75,8 +75,8 @@ end end Q, X = qx(A; positive=positive) Ap = Q * X - @test NDTensors.cpu(A) ≈ NDTensors.cpu(Ap) atol = eps - @test NDTensors.cpu(array(Q)' * array(Q)) ≈ Id atol = eps + @test cpu(A) ≈ cpu(Ap) atol = eps + @test cpu(array(Q)' * array(Q)) ≈ Id atol = eps @allowscalar if positive nr, nc = size(X) dr = qx == ql ? 
Base.max(0, nc - nr) : 0

From 8a5a3d7fdea665bf6393db92fdcc0f3cfa406547 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 30 Nov 2023 13:46:11 -0500
Subject: [PATCH 65/73] format

---
 NDTensors/test/arraytensor/runtests.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/NDTensors/test/arraytensor/runtests.jl b/NDTensors/test/arraytensor/runtests.jl
index 9b268694b1..a423524d05 100644
--- a/NDTensors/test/arraytensor/runtests.jl
+++ b/NDTensors/test/arraytensor/runtests.jl
@@ -1,8 +1,8 @@
 @eval module $(gensym())
-  using Test: @testset
-  @testset "Tensor wrapping AbstractArrays $(f)" for f in [
-    "array.jl", "blocksparsearray.jl", "diagonalarray.jl"
-  ]
-    include(f)
-  end
+using Test: @testset
+@testset "Tensor wrapping AbstractArrays $(f)" for f in [
+  "array.jl", "blocksparsearray.jl", "diagonalarray.jl"
+]
+  include(f)
+end
 end

From b56f8a421000e56c25fac910fc4c38d6400e4e8c Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 30 Nov 2023 13:48:53 -0500
Subject: [PATCH 66/73] grab NDTensors.cpu

---
 NDTensors/test/test_linearalgebra.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl
index 90cced04e9..289b2c4625 100644
--- a/NDTensors/test/test_linearalgebra.jl
+++ b/NDTensors/test/test_linearalgebra.jl
@@ -1,5 +1,6 @@
 @eval module $(gensym())
 using NDTensors
+using NDTensors: cpu
 using LinearAlgebra: Diagonal, qr, diag
 using Test: @testset, @test
 using GPUArraysCore: @allowscalar

From 66045ae3e4c27f1d001b57b567cc08723e8a96c3 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 30 Nov 2023 18:18:52 -0500
Subject: [PATCH 67/73] Simplify code

---
 NDTensors/ext/examples/NDTensorCUDA.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl
index 1c169d29c9..c78efa4416 100644
--- a/NDTensors/ext/examples/NDTensorCUDA.jl
+++ b/NDTensors/ext/examples/NDTensorCUDA.jl
@@ -62,7 +62,8 @@ function main()
   @test cpu(cB * cC * cD) ≈ cpu(grad[1])
   @test (cB * cC * cD) ≈ grad[1]
   # Create a tuple of indices
-  decomp = (dim(ind(grad[1], 1)), dim(ind(grad[1], 2)) * dim(ind(grad[1], 3)))
+  dims = size(grad[1])
+  decomp = (dims[1], dims[2] * dims[3])
   # Reshape the CuVector of data into a matrix
   cuTensor_data = reshape(array(grad[1]), decomp)
   # Use cuBLAS to compute SVD of data

From a2d569df6b11419ed29bfe25a5f58347600ad677 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 30 Nov 2023 18:21:36 -0500
Subject: [PATCH 68/73] add random module to files it's missing

---
 NDTensors/src/lib/BlockSparseArrays/test/runtests.jl | 2 ++
 NDTensors/src/lib/SetParameters/test/runtests.jl | 4 +++-
 NDTensors/src/lib/SmallVectors/test/runtests.jl | 4 +++-
 NDTensors/src/lib/SortedSets/test/runtests.jl | 4 +++-
 NDTensors/src/lib/TagSets/test/runtests.jl | 4 +++-
 NDTensors/src/lib/TensorAlgebra/test/runtests.jl | 2 ++
 NDTensors/src/lib/Unwrap/test/runtests.jl | 2 ++
 7 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl b/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl
index 862a504388..9f8bc253e3 100644
--- a/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl
+++ b/NDTensors/src/lib/BlockSparseArrays/test/runtests.jl
@@ -1,3 +1,4 @@
+@eval module $(gensym())
 using Test: @test, @testset, @test_broken
 using BlockArrays: BlockArrays, BlockRange, blocksize
 using Compat: allequal
@@ -101,3 +102,4 @@ include("TestBlockSparseArraysUtils.jl")
     @test Hermitian(Matrix(a)) * Matrix(u) ≈
Matrix(u) * Diagonal(Vector(d)) end end +end diff --git a/NDTensors/src/lib/SetParameters/test/runtests.jl b/NDTensors/src/lib/SetParameters/test/runtests.jl index 646a983115..6c5c5d25e2 100644 --- a/NDTensors/src/lib/SetParameters/test/runtests.jl +++ b/NDTensors/src/lib/SetParameters/test/runtests.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) +using Test: @inferred, @test, @testset using NDTensors.SetParameters @testset "Test NDTensors.SetParameters" begin @@ -152,3 +153,4 @@ using NDTensors.SetParameters Array{Float64,1} end end +end diff --git a/NDTensors/src/lib/SmallVectors/test/runtests.jl b/NDTensors/src/lib/SmallVectors/test/runtests.jl index 4b1daac154..3f6330badf 100644 --- a/NDTensors/src/lib/SmallVectors/test/runtests.jl +++ b/NDTensors/src/lib/SmallVectors/test/runtests.jl @@ -1,5 +1,6 @@ +@eval module $(gensym()) using NDTensors.SmallVectors -using Test +using Test: @inferred, @test, @testset, @test_broken using NDTensors.SmallVectors: setindex, @@ -153,3 +154,4 @@ end # @testset "SmallVectors" test_smallvectors() # (new in Julia 1.9) test_smallvectors() +end diff --git a/NDTensors/src/lib/SortedSets/test/runtests.jl b/NDTensors/src/lib/SortedSets/test/runtests.jl index 14061d17dd..882fa15e18 100644 --- a/NDTensors/src/lib/SortedSets/test/runtests.jl +++ b/NDTensors/src/lib/SortedSets/test/runtests.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) +using Test: @test, @testset using NDTensors.SortedSets using NDTensors.SmallVectors @@ -38,3 +39,4 @@ using NDTensors.SmallVectors @test ("a", 3) ∉ parent(s) end end +end diff --git a/NDTensors/src/lib/TagSets/test/runtests.jl b/NDTensors/src/lib/TagSets/test/runtests.jl index cf7336bbc1..ee5c72a199 100644 --- a/NDTensors/src/lib/TagSets/test/runtests.jl +++ b/NDTensors/src/lib/TagSets/test/runtests.jl @@ -1,4 +1,5 @@ -using Test +@eval module $(gensym()) +using Test: @test, @testset using NDTensors.TagSets using NDTensors.SortedSets using NDTensors.SmallVectors @@ -31,3 +32,4 @@ using NDTensors.Dictionaries end end end +end diff --git a/NDTensors/src/lib/TensorAlgebra/test/runtests.jl b/NDTensors/src/lib/TensorAlgebra/test/runtests.jl index d5182591a8..21ae8f3fb5 100644 --- a/NDTensors/src/lib/TensorAlgebra/test/runtests.jl +++ b/NDTensors/src/lib/TensorAlgebra/test/runtests.jl @@ -1,3 +1,4 @@ +@eval module $(gensym()) using Combinatorics: permutations using LinearAlgebra: qr using NDTensors.TensorAlgebra: TensorAlgebra @@ -49,3 +50,4 @@ using Test: @test, @test_broken, @testset @test a ≈ a′ end end +end diff --git a/NDTensors/src/lib/Unwrap/test/runtests.jl b/NDTensors/src/lib/Unwrap/test/runtests.jl index 055a9002da..0d1082601e 100644 --- a/NDTensors/src/lib/Unwrap/test/runtests.jl +++ b/NDTensors/src/lib/Unwrap/test/runtests.jl @@ -1,3 +1,4 @@ +@eval module $(gensym()) using Test: @testset, @test, @test_broken using NDTensors.Unwrap using NDTensors: NDTensors, mul!! 
@@ -235,3 +236,4 @@ using .NDTensorsTestUtils: devices_list
   mul!!(C, B, A, true, false)
   @test cpu(C) ≈ Cp
 end
+end

From d85ac43c870c03d43c8a3997d1dcd6918e777d18 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 1 Dec 2023 09:11:49 -0500
Subject: [PATCH 69/73] Move readwrite to `NDTensors/test/broken`

---
 NDTensors/test/{ => broken}/readwrite.jl | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename NDTensors/test/{ => broken}/readwrite.jl (100%)

diff --git a/NDTensors/test/readwrite.jl b/NDTensors/test/broken/readwrite.jl
similarity index 100%
rename from NDTensors/test/readwrite.jl
rename to NDTensors/test/broken/readwrite.jl

From ce0caba8ec7a78d06be739d978d7672f9326e556 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 1 Dec 2023 09:42:32 -0500
Subject: [PATCH 70/73] Remove @eval module...

---
 NDTensors/src/lib/SmallVectors/test/runtests.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/NDTensors/src/lib/SmallVectors/test/runtests.jl b/NDTensors/src/lib/SmallVectors/test/runtests.jl
index 3f6330badf..fbed402846 100644
--- a/NDTensors/src/lib/SmallVectors/test/runtests.jl
+++ b/NDTensors/src/lib/SmallVectors/test/runtests.jl
@@ -1,4 +1,3 @@
-@eval module $(gensym())
 using NDTensors.SmallVectors
 using Test: @inferred, @test, @testset, @test_broken

From ffbb01cda23bf2e6119a6b8183a311993bb481e6 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 1 Dec 2023 09:42:53 -0500
Subject: [PATCH 71/73] remove extra end

---
 NDTensors/src/lib/SmallVectors/test/runtests.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/NDTensors/src/lib/SmallVectors/test/runtests.jl b/NDTensors/src/lib/SmallVectors/test/runtests.jl
index fbed402846..d86bfd1060 100644
--- a/NDTensors/src/lib/SmallVectors/test/runtests.jl
+++ b/NDTensors/src/lib/SmallVectors/test/runtests.jl
@@ -153,4 +153,3 @@ end
 # @testset "SmallVectors" test_smallvectors() # (new in Julia 1.9)
 test_smallvectors()
-end

From bc7c400c6f2b90103653c6004b7d9147a9218b77 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Fri, 1 Dec 2023 13:42:15 -0500
Subject: [PATCH 72/73] Use TupleTools.sort

---
 NDTensors/src/blocksparse/blocksparsetensor.jl | 4 ++--
 NDTensors/src/tupletools.jl | 2 +-
 src/mps/dmrg.jl | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl
index 435983039c..db2b6d85e9 100644
--- a/NDTensors/src/blocksparse/blocksparsetensor.jl
+++ b/NDTensors/src/blocksparse/blocksparsetensor.jl
@@ -446,7 +446,7 @@ function permutedims_combine_output(
   # Now that the indices are permuted, compute
   # which indices are now combined
-  combdims_perm = sort(_permute_combdims(combdims, perm))
+  combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm))
   # Permute the nonzero blocks (dimension-wise)
   blocks = nzblocks(T)
@@ -481,7 +481,7 @@ function permutedims_combine(
   # Now that the indices are permuted, compute
   # which indices are now combined
-  combdims_perm = sort(_permute_combdims(combdims, perm))
+  combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm))
   comb_ind_loc = minimum(combdims_perm)
   # Determine the new index before combining

diff --git a/NDTensors/src/tupletools.jl b/NDTensors/src/tupletools.jl
index 8e9874bfb5..395f0619f5 100644
--- a/NDTensors/src/tupletools.jl
+++ b/NDTensors/src/tupletools.jl
@@ -151,7 +151,7 @@ end
 deleteat(t::Tuple, I::Tuple{Int}) = deleteat(t, I[1])
 function deleteat(t::Tuple, I::Tuple{Int,Int,Vararg{Int}})
   return deleteat_sorted(t,
TupleTools.sort(I; rev=true)) end deleteat_sorted(t::Tuple, pos::Int64) = deleteat(t, pos[1]) diff --git a/src/mps/dmrg.jl b/src/mps/dmrg.jl index eda11f5292..49444e606e 100644 --- a/src/mps/dmrg.jl +++ b/src/mps/dmrg.jl @@ -10,7 +10,7 @@ function permute( for n in 1:length(M) lₙ₋₁ = linkind(M, n - 1) lₙ = linkind(M, n) - s⃗ₙ = sort(Tuple(siteinds(M, n)); by=plev) + s⃗ₙ = NDTensors.TupleTools.sort(Tuple(siteinds(M, n)); by=plev) M̃[n] = permute(M[n], filter(!isnothing, (lₙ₋₁, s⃗ₙ..., lₙ))) end set_ortho_lims!(M̃, ortho_lims(M)) From 3eadb7b22deb6981c60e655541de1f9092cab791 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Fri, 1 Dec 2023 16:54:11 -0500 Subject: [PATCH 73/73] use `TupleTools.sort` --- .../blocksparsearray/storage/combiner/contract_combine.jl | 4 ++-- src/ITensors.jl | 1 + src/physics/fermions.jl | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl b/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl index c0d4db7654..8e6a4a2cee 100644 --- a/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl +++ b/NDTensors/src/arraystorage/blocksparsearray/storage/combiner/contract_combine.jl @@ -49,7 +49,7 @@ function permutedims_combine( # Now that the indices are permuted, compute # which indices are now combined - combdims_perm = sort(_permute_combdims(combdims, perm)) + combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm)) comb_ind_loc = minimum(combdims_perm) # Determine the new index before combining @@ -117,7 +117,7 @@ function permutedims_combine_output( # Now that the indices are permuted, compute # which indices are now combined - combdims_perm = sort(_permute_combdims(combdims, perm)) + combdims_perm = TupleTools.sort(_permute_combdims(combdims, perm)) # Permute the nonzero blocks (dimension-wise) blocks = nzblocks(a_src) diff --git a/src/ITensors.jl b/src/ITensors.jl index bc96ecfc81..e91fb6959d 100644 --- a/src/ITensors.jl +++ b/src/ITensors.jl @@ -71,6 +71,7 @@ using Random using SerializedElementArrays using StaticArrays using TimerOutputs +using TupleTools using Zeros ##################################### diff --git a/src/physics/fermions.jl b/src/physics/fermions.jl index 262721746e..289576436a 100644 --- a/src/physics/fermions.jl +++ b/src/physics/fermions.jl @@ -157,8 +157,8 @@ end # may be a tuple of QNIndex, so convert to a Vector{Index} indsR = collect(input_indsR) - nlabelsT1 = NDTensors.sort(labelsT1; rev=true) - nlabelsT2 = NDTensors.sort(labelsT2) + nlabelsT1 = TupleTools.sort(labelsT1; rev=true) + nlabelsT2 = TupleTools.sort(labelsT2) # Make orig_labelsR from the order of # indices that would result by just
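A closing note on the `sort` to `TupleTools.sort` changes in patches 72 and 73: `Base.sort` has no method for `Tuple`, and converting a tuple to a `Vector` and back is not type-stable, while `TupleTools.sort` sorts a tuple directly and returns a tuple. A small sketch of the calls these diffs rely on; the values here are illustrative only:

    using TupleTools: TupleTools

    t = (3, 1, 2)
    @assert TupleTools.sort(t) == (1, 2, 3)
    ## `rev` keyword, as in the `deleteat` change above:
    @assert TupleTools.sort(t; rev=true) == (3, 2, 1)
    ## `by` keyword, as in the `by=plev` call site in src/mps/dmrg.jl:
    @assert TupleTools.sort((-2, 1, 3); by=abs) == (1, -2, 3)

Since TupleTools is already a dependency brought into scope in src/ITensors.jl by patch 73, these call sites can use it without any further setup.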