From 6bb7f06ec14981b2acfdfcbcdf1e19b393b938d1 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Fri, 15 Nov 2024 18:40:43 +0000
Subject: [PATCH 01/15] Fix tolerances and add test

---
 cpp/basix/finite-element.cpp |  2 +-
 cpp/basix/finite-element.h   |  1 -
 test/test_lagrange.py        | 13 +++++++++++++
 3 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/cpp/basix/finite-element.cpp b/cpp/basix/finite-element.cpp
index e97b804c7..7b8e92ca0 100644
--- a/cpp/basix/finite-element.cpp
+++ b/cpp/basix/finite-element.cpp
@@ -965,7 +965,7 @@ FiniteElement<F>::FiniteElement(
           rtot += r;
         }
 
-        constexpr F eps = 10.0 * std::numeric_limits<float>::epsilon();
+        const F eps = 10 * degree * degree * std::numeric_limits<F>::epsilon();
         if ((trans.extent(2) != 1 and std::abs(rmin) > eps)
             or std::abs(rmax - 1.0) > eps or std::abs(rtot - 1.0) > eps)
         {
diff --git a/cpp/basix/finite-element.h b/cpp/basix/finite-element.h
index e65ad8769..9fe29e9e4 100644
--- a/cpp/basix/finite-element.h
+++ b/cpp/basix/finite-element.h
@@ -26,7 +26,6 @@
 /// Basix: FEniCS runtime basis evaluation library
 namespace basix
 {
-
 namespace impl
 {
 template <typename T, std::size_t d>
diff --git a/test/test_lagrange.py b/test/test_lagrange.py
index c9db0e336..0d5da1ac2 100644
--- a/test/test_lagrange.py
+++ b/test/test_lagrange.py
@@ -949,6 +949,19 @@ def test_dof_transformations_tetrahedron(degree):
         assert np.allclose(t, actual)
 
 
+@pytest.mark.parametrize(
+    "celltype",
+    [basix.CellType.interval, basix.CellType.triangle, basix.CellType.tetrahedron],
+)
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_lagrange_transform(celltype, dtype):
+    for p in range(1, 15):
+        e = basix.create_element(
+            basix.ElementFamily.P, celltype, p, basix.LagrangeVariant.gll_warped, dtype=dtype
+        )
+        assert e.dof_transformations_are_permutations
+
+
 @pytest.mark.parametrize("degree", [1, 2, 3, 4])
 @pytest.mark.parametrize(
     "celltype",

From 1868dd3c4e4266bc1b017b814f2837de4d07e863 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Fri, 15 Nov 2024 18:46:31 +0000
Subject: [PATCH 02/15] Make eps consistent

---
 cpp/basix/finite-element.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/basix/finite-element.cpp b/cpp/basix/finite-element.cpp
index 7b8e92ca0..f2da5a08e 100644
--- a/cpp/basix/finite-element.cpp
+++ b/cpp/basix/finite-element.cpp
@@ -1436,7 +1436,7 @@ FiniteElement<F>::FiniteElement(
     for (std::size_t col = 0; col < matM.extent(1); ++col)
     {
       F v = col == row ? 1.0 : 0.0;
-      constexpr F eps = 100 * std::numeric_limits<F>::epsilon();
+      const F eps = 10 * degree * degree * std::numeric_limits<F>::epsilon();
       if (std::abs(matM(row, col) - v) > eps)
       {
         _interpolation_is_identity = false;

From eec248988cc3ff5f07b7226b9423a0255481f6fb Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Fri, 15 Nov 2024 22:09:39 +0000
Subject: [PATCH 03/15] Major speed improvement

---
 cpp/basix/dof-transformations.cpp | 152 +++++++++++++++++-------------
 cpp/basix/finite-element.cpp      |   1 +
 test/test_lagrange.py             |  18 +++-
 3 files changed, 102 insertions(+), 69 deletions(-)

diff --git a/cpp/basix/dof-transformations.cpp b/cpp/basix/dof-transformations.cpp
index f842ac5ca..e06caeb96 100644
--- a/cpp/basix/dof-transformations.cpp
+++ b/cpp/basix/dof-transformations.cpp
@@ -13,6 +13,8 @@
 #include <span>
 #include <tuple>
 
+#include <iostream>
+
 using namespace basix;
 
 namespace stdex
@@ -39,8 +41,7 @@ int find_first_subentity(cell::type cell_type, cell::type entity_type)
 {
   const int edim = cell::topological_dimension(entity_type);
   std::vector<cell::type> entities = cell::subentity_types(cell_type)[edim];
-  if (auto it = std::ranges::find(entities, entity_type);
-      it != entities.end())
+  if (auto it = std::ranges::find(entities, entity_type); it != entities.end())
   {
     return std::distance(entities.begin(), it);
   }
@@ -93,9 +94,7 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
   {
     mapinfo_t<T> mapinfo;
     auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-    auto map = [](auto pt) -> std::array<T, 3> {
-      return {pt[1], pt[0], 0.0};
-    };
+    auto map = [](auto pt) -> std::array<T, 3> { return {pt[1], pt[0], 0.0}; };
     stdex::mdarray<T, stdex::extents<std::size_t, 2, 2>> J(
         stdex::extents<std::size_t, 2, 2>{}, {0., 1., 1., 0.});
 
@@ -109,9 +108,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
   {
     mapinfo_t<T> mapinfo;
     auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-    auto map = [](auto pt) -> std::array<T, 3> {
-      return {1 - pt[0], pt[1], 0};
-    };
+    auto map
+        = [](auto pt) -> std::array<T, 3> { return {1 - pt[0], pt[1], 0}; };
     stdex::mdarray<T, stdex::extents<std::size_t, 2, 2>> J(
         stdex::extents<std::size_t, 2, 2>{}, {-1., 0., 0., 1.});
 
@@ -126,9 +124,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     mapinfo_t<T> mapinfo;
     {
       auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-      auto map = [](auto pt) -> std::array<T, 3> {
-        return {pt[0], pt[2], pt[1]};
-      };
+      auto map
+          = [](auto pt) -> std::array<T, 3> { return {pt[0], pt[2], pt[1]}; };
       stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
           stdex::extents<std::size_t, 3, 3>{},
           {1., 0., 0., 0., 0., 1., 0., 1., 0.});
@@ -143,9 +140,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     {
       auto& data = mapinfo.try_emplace(cell::type::triangle).first->second;
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[2], pt[0], pt[1]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[2], pt[0], pt[1]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., 0., 0., 1., 1., 0., 0.});
@@ -157,9 +153,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
         data.push_back(std::tuple(map, J, detJ, K));
       }
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[0], pt[2], pt[1]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[0], pt[2], pt[1]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {1., 0., 0., 0., 0., 1., 0., 1., 0.});
@@ -179,9 +174,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     mapinfo_t<T> mapinfo;
     {
       auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-      auto map = [](auto pt) -> std::array<T, 3> {
-        return {1 - pt[0], pt[1], pt[2]};
-      };
+      auto map = [](auto pt) -> std::array<T, 3>
+      { return {1 - pt[0], pt[1], pt[2]}; };
       stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
           stdex::extents<std::size_t, 3, 3>{},
           {-1., 0., 0., 0., 1., 0., 0., 0., 1.});
@@ -196,9 +190,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     {
       auto& data = mapinfo.try_emplace(cell::type::quadrilateral).first->second;
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {1 - pt[1], pt[0], pt[2]};
-        };
+        auto map = [](auto pt) -> std::array<T, 3>
+        { return {1 - pt[1], pt[0], pt[2]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., -1., 0., 0., 0., 0., 1.});
@@ -210,9 +203,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
         data.push_back(std::tuple(map, J, detJ, K));
       }
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[1], pt[0], pt[2]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[1], pt[0], pt[2]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., 1., 0., 0., 0., 0., 1.});
@@ -231,9 +223,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     mapinfo_t<T> mapinfo;
     {
       auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-      auto map = [](auto pt) -> std::array<T, 3> {
-        return {1 - pt[0], pt[1], pt[2]};
-      };
+      auto map = [](auto pt) -> std::array<T, 3>
+      { return {1 - pt[0], pt[1], pt[2]}; };
       stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
           stdex::extents<std::size_t, 3, 3>{},
           {-1., 0., 0., 0., 1., 0., 0., 0., 1.});
@@ -246,9 +237,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     {
       auto& data = mapinfo.try_emplace(cell::type::triangle).first->second;
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {1 - pt[1] - pt[0], pt[0], pt[2]};
-        };
+        auto map = [](auto pt) -> std::array<T, 3>
+        { return {1 - pt[1] - pt[0], pt[0], pt[2]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., -1., -1., 0., 0., 0., 1.});
@@ -259,9 +249,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
         data.push_back(std::tuple(map, J, detJ, K));
       }
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[1], pt[0], pt[2]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[1], pt[0], pt[2]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., 1., 0., 0., 0., 0., 1.});
@@ -275,9 +264,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     {
       auto& data = mapinfo.try_emplace(cell::type::quadrilateral).first->second;
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {1 - pt[2], pt[1], pt[0]};
-        };
+        auto map = [](auto pt) -> std::array<T, 3>
+        { return {1 - pt[2], pt[1], pt[0]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 0., 1., 0., 1., 0., -1., 0., 0.});
@@ -288,9 +276,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
         data.push_back(std::tuple(map, J, detJ, K));
       }
       { // scope
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[2], pt[1], pt[0]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[2], pt[1], pt[0]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 0., 1., 0., 1., 0., 1., 0., 0.});
@@ -309,9 +296,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     mapinfo_t<T> mapinfo;
     {
       auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-      auto map = [](auto pt) -> std::array<T, 3> {
-        return {1 - pt[0], pt[1], pt[2]};
-      };
+      auto map = [](auto pt) -> std::array<T, 3>
+      { return {1 - pt[0], pt[1], pt[2]}; };
       stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
           stdex::extents<std::size_t, 3, 3>{},
           {-1., 0., 0., 0., 1., 0., 0., 0., 1.});
@@ -325,9 +311,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     {
       auto& data = mapinfo.try_emplace(cell::type::quadrilateral).first->second;
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {1 - pt[1], pt[0], pt[2]};
-        };
+        auto map = [](auto pt) -> std::array<T, 3>
+        { return {1 - pt[1], pt[0], pt[2]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., -1., 0., 0., 0., 0., 1.});
@@ -338,9 +323,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
         data.push_back(std::tuple(map, J, detJ, K));
       }
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[1], pt[0], pt[2]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[1], pt[0], pt[2]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 1., 0., 1., 0., 0., 0., 0., 1.});
@@ -355,9 +339,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
     {
       auto& data = mapinfo.try_emplace(cell::type::triangle).first->second;
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {1 - pt[2] - pt[0], pt[1], pt[0]};
-        };
+        auto map = [](auto pt) -> std::array<T, 3>
+        { return {1 - pt[2] - pt[0], pt[1], pt[0]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 0., 1., 0., 1., 0., -1., 0., -1.});
@@ -368,9 +351,8 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
         data.push_back(std::tuple(map, J, detJ, K));
       }
       {
-        auto map = [](auto pt) -> std::array<T, 3> {
-          return {pt[2], pt[1], pt[0]};
-        };
+        auto map
+            = [](auto pt) -> std::array<T, 3> { return {pt[2], pt[1], pt[0]}; };
         stdex::mdarray<T, stdex::extents<std::size_t, 3, 3>> J(
             stdex::extents<std::size_t, 3, 3>{},
             {0., 0., 1., 0., 1., 0., 1., 0., 0.});
@@ -426,7 +408,7 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
   mdarray_t<T, 2> mapped_pts(pts.extents());
   for (std::size_t p = 0; p < mapped_pts.extent(0); ++p)
   {
-    auto mp = map_point(
+    std::array<T, 3> mp = map_point(
         std::span(pts.data_handle() + p * pts.extent(1), pts.extent(1)));
     for (std::size_t k = 0; k < mapped_pts.extent(1); ++k)
       mapped_pts(p, k) = mp[k];
@@ -439,18 +421,56 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
   mdspan_t<const T, 2> polyset_vals(polyset_vals_b.data(), polyset_shape[1],
                                     polyset_shape[2]);
 
-  mdarray_t<T, 3> tabulated_data(npts, total_ndofs, vs);
+  std::vector<T> tabulated_data_b(npts * total_ndofs * vs);
+  mdspan_t<T, 3> tabulated_data(tabulated_data_b.data(), npts, total_ndofs, vs);
+  // mdarray_t<T, 3> tabulated_data(npts, total_ndofs, vs);
+
+  // std::vector<T> result_b(polyset_vals.extent(1) * coeffs.extent(0));
+  // mdspan_t<T, 2> result(result_b.data(), polyset_vals.extent(1),
+  //                       coeffs.extent(0));
+
+  std::vector<T> resultT_b(polyset_vals.extent(1) * coeffs.extent(0));
+  mdspan_t<T, 2> resultT(resultT_b.data(), coeffs.extent(0),
+                         polyset_vals.extent(1));
+  // mdarray_t<T, 2> result(polyset_vals.extent(1), coeffs.extent(0));
+
+  // std::cout << "3: transform: " << vs << ", " << coeffs.extent(0) << ", "
+  //           << polyset_vals.extent(1) << ", " << polyset_vals.extent(0)
+  //           << std::endl;
+
+  std::vector<T> coeffs_b(coeffs.extent(0) * polyset_vals.extent(0));
+  mdspan_t<T, 2> _coeffs(coeffs_b.data(), coeffs.extent(0),
+                         polyset_vals.extent(0));
   for (std::size_t j = 0; j < vs; ++j)
   {
-    mdarray_t<T, 2> result(polyset_vals.extent(1), coeffs.extent(0));
+    // std::fill(result_b.begin(), result_b.end(), 0);
+
     for (std::size_t k0 = 0; k0 < coeffs.extent(0); ++k0)
-      for (std::size_t k1 = 0; k1 < polyset_vals.extent(1); ++k1)
-        for (std::size_t k2 = 0; k2 < polyset_vals.extent(0); ++k2)
-          result(k1, k0) += coeffs(k0, k2 + psize * j) * polyset_vals(k2, k1);
+      for (std::size_t k2 = 0; k2 < polyset_vals.extent(0); ++k2)
+        _coeffs(k0, k2) = coeffs(k0, k2 + psize * j);
+
+    // R^T = coeffs * polyset_vals
+    // for (std::size_t k0 = 0; k0 < coeffs.extent(0); ++k0) // big
+    //   for (std::size_t k1 = 0; k1 < polyset_vals.extent(1); ++k1)
+    //     for (std::size_t k2 = 0; k2 < polyset_vals.extent(0); ++k2) // big
+    //       result(k1, k0) += _coeffs(k0, k2) * polyset_vals(k2, k1);
+    // result(k1, k0) += coeffs(k0, k2 + psize * j) * polyset_vals(k2, k1);
+
+    // r^t: coeffs.extent(0) x polyset_vals.extent(1) [k0, k1]
+    // c: coeffs.extent(1) x polyset_vals.extent(0)   [k0, k2]
+    // p: polyset_vals.extent(0) x polyset_vals.extent(1)  [k2, k1]
+
+    math::dot(_coeffs, polyset_vals, resultT);
+
+    // std::cout << "4: transform" << std::endl;
+
+    for (std::size_t k0 = 0; k0 < resultT.extent(1); ++k0)
+      for (std::size_t k1 = 0; k1 < resultT.extent(0); ++k1)
+        tabulated_data(k0, k1, j) = resultT(k1, k0);
 
-    for (std::size_t k0 = 0; k0 < result.extent(0); ++k0)
-      for (std::size_t k1 = 0; k1 < result.extent(1); ++k1)
-        tabulated_data(k0, k1, j) = result(k0, k1);
+    // for (std::size_t k0 = 0; k0 < result.extent(0); ++k0)
+    //   for (std::size_t k1 = 0; k1 < result.extent(1); ++k1)
+    //     tabulated_data(k0, k1, j) = result(k0, k1);
   }
 
   // push forward
@@ -460,7 +480,7 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
     for (std::size_t i = 0; i < npts; ++i)
     {
       mdspan_t<const T, 2> tab(
-          tabulated_data.data()
+          tabulated_data_b.data()
               + i * tabulated_data.extent(1) * tabulated_data.extent(2),
           tabulated_data.extent(1), tabulated_data.extent(2));
 
diff --git a/cpp/basix/finite-element.cpp b/cpp/basix/finite-element.cpp
index f2da5a08e..411a61fd2 100644
--- a/cpp/basix/finite-element.cpp
+++ b/cpp/basix/finite-element.cpp
@@ -23,6 +23,7 @@
 #include <concepts>
 #include <limits>
 #include <numeric>
+
 #define str_macro(X) #X
 #define str(X) str_macro(X)
 
diff --git a/test/test_lagrange.py b/test/test_lagrange.py
index 0d5da1ac2..ca0f4bc35 100644
--- a/test/test_lagrange.py
+++ b/test/test_lagrange.py
@@ -951,11 +951,23 @@ def test_dof_transformations_tetrahedron(degree):
 
 @pytest.mark.parametrize(
     "celltype",
-    [basix.CellType.interval, basix.CellType.triangle, basix.CellType.tetrahedron],
+    [
+        basix.CellType.interval,
+        basix.CellType.triangle,
+        basix.CellType.tetrahedron,
+        basix.CellType.quadrilateral,
+        basix.CellType.hexahedron,
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.float32,
+        np.float64,
+    ],
 )
-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_lagrange_transform(celltype, dtype):
-    for p in range(1, 15):
+    for p in range(1, 12):
         e = basix.create_element(
             basix.ElementFamily.P, celltype, p, basix.LagrangeVariant.gll_warped, dtype=dtype
         )

From ff1dfaa7f49f607eb1203e5930effb4ad507dc0e Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Fri, 15 Nov 2024 22:53:23 +0000
Subject: [PATCH 04/15] Tidy up

---
 cpp/basix/dof-transformations.cpp | 32 +++++++++++++------------------
 cpp/basix/math.h                  |  6 ++++++
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/cpp/basix/dof-transformations.cpp b/cpp/basix/dof-transformations.cpp
index e06caeb96..03b4a77cf 100644
--- a/cpp/basix/dof-transformations.cpp
+++ b/cpp/basix/dof-transformations.cpp
@@ -423,21 +423,9 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
 
   std::vector<T> tabulated_data_b(npts * total_ndofs * vs);
   mdspan_t<T, 3> tabulated_data(tabulated_data_b.data(), npts, total_ndofs, vs);
-  // mdarray_t<T, 3> tabulated_data(npts, total_ndofs, vs);
-
-  // std::vector<T> result_b(polyset_vals.extent(1) * coeffs.extent(0));
-  // mdspan_t<T, 2> result(result_b.data(), polyset_vals.extent(1),
-  //                       coeffs.extent(0));
-
-  std::vector<T> resultT_b(polyset_vals.extent(1) * coeffs.extent(0));
-  mdspan_t<T, 2> resultT(resultT_b.data(), coeffs.extent(0),
-                         polyset_vals.extent(1));
-  // mdarray_t<T, 2> result(polyset_vals.extent(1), coeffs.extent(0));
-
-  // std::cout << "3: transform: " << vs << ", " << coeffs.extent(0) << ", "
-  //           << polyset_vals.extent(1) << ", " << polyset_vals.extent(0)
-  //           << std::endl;
-
+  std::vector<T> result_b(polyset_vals.extent(1) * coeffs.extent(0));
+  mdspan_t<T, 2> result(result_b.data(), coeffs.extent(0),
+                        polyset_vals.extent(1));
   std::vector<T> coeffs_b(coeffs.extent(0) * polyset_vals.extent(0));
   mdspan_t<T, 2> _coeffs(coeffs_b.data(), coeffs.extent(0),
                          polyset_vals.extent(0));
@@ -460,13 +448,13 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
     // c: coeffs.extent(1) x polyset_vals.extent(0)   [k0, k2]
     // p: polyset_vals.extent(0) x polyset_vals.extent(1)  [k2, k1]
 
-    math::dot(_coeffs, polyset_vals, resultT);
+    math::dot(_coeffs, polyset_vals, result);
 
     // std::cout << "4: transform" << std::endl;
 
-    for (std::size_t k0 = 0; k0 < resultT.extent(1); ++k0)
-      for (std::size_t k1 = 0; k1 < resultT.extent(0); ++k1)
-        tabulated_data(k0, k1, j) = resultT(k1, k0);
+    for (std::size_t k0 = 0; k0 < result.extent(1); ++k0)
+      for (std::size_t k1 = 0; k1 < result.extent(0); ++k1)
+        tabulated_data(k0, k1, j) = result(k1, k0);
 
     // for (std::size_t k0 = 0; k0 < result.extent(0); ++k0)
     //   for (std::size_t k1 = 0; k1 < result.extent(1); ++k1)
@@ -502,10 +490,16 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
     for (std::size_t i = 0; i < vs; ++i)
     {
       for (std::size_t k0 = 0; k0 < transform.extent(1); ++k0)
+      {
         for (std::size_t k1 = 0; k1 < transform.extent(0); ++k1)
+        {
           for (std::size_t k2 = 0; k2 < imat.extent(2); ++k2)
+          {
             transform(k1, k0)
                 += imat(k0, i, k2, d) * pushed_data(k2, k1 + dofstart, i);
+          }
+        }
+      }
     }
   }
 
diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 82cb86ff9..1199941b3 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -330,6 +330,12 @@ void dot(const U& A, const V& B, W&& C)
   }
   else
   {
+    static_assert(std::is_same_v<typename U::layout_type, std::layout_right>);
+    static_assert(std::is_same_v<typename V::layout_type, std::layout_right>);
+    // static_assert(
+    //     std::is_same_v<typename std::remove_cv<typename W::layout_type>,
+    //                    std::layout_right>);
+
     using T = typename std::decay_t<U>::value_type;
     impl::dot_blas<T>(
         std::span(A.data_handle(), A.size()), {A.extent(0), A.extent(1)},

From 6260efa4dfcd631f7e356511828e05bd157822ee Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 09:49:59 +0000
Subject: [PATCH 05/15] Add assertions

---
 cpp/basix/math.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 1199941b3..074f81391 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <array>
 #include <cmath>
 #include <concepts>
@@ -330,13 +331,22 @@ void dot(const U& A, const V& B, W&& C)
   }
   else
   {
-    static_assert(std::is_same_v<typename U::layout_type, std::layout_right>);
-    static_assert(std::is_same_v<typename V::layout_type, std::layout_right>);
-    // static_assert(
+    static_assert(std::is_same_v<typename std::decay_t<U>::layout_type,
+                                 std::layout_right>);
+    static_assert(std::is_same_v<typename std::decay_t<V>::layout_type,
+                                 std::layout_right>);
+    static_assert(std::is_same_v<typename std::decay_t<W>::layout_type,
+                                 std::layout_right>);
+
+    // static_assert(std::is_same_v<typename U::layout_type,
+    // std::layout_right>); static_assert(std::is_same_v<typename
+    // V::layout_type, std::layout_right>); static_assert(
     //     std::is_same_v<typename std::remove_cv<typename W::layout_type>,
     //                    std::layout_right>);
 
     using T = typename std::decay_t<U>::value_type;
+    // static_assert(std::is_same_v<typename std::decay_t<V>::value_type, T>);
+    // static_assert(std::is_same_v<typename std::decay_t<W>::value_type, T>);
     impl::dot_blas<T>(
         std::span(A.data_handle(), A.size()), {A.extent(0), A.extent(1)},
         std::span(B.data_handle(), B.size()), {B.extent(0), B.extent(1)},

From 869189c6439798bd7216590f22ad778d1a9daa4e Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 09:55:04 +0000
Subject: [PATCH 06/15] Assert of float type

---
 cpp/basix/math.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 074f81391..aaffc79d4 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -345,8 +345,8 @@ void dot(const U& A, const V& B, W&& C)
     //                    std::layout_right>);
 
     using T = typename std::decay_t<U>::value_type;
-    // static_assert(std::is_same_v<typename std::decay_t<V>::value_type, T>);
-    // static_assert(std::is_same_v<typename std::decay_t<W>::value_type, T>);
+    static_assert(std::is_same_v<typename std::decay_t<V>::value_type, T>);
+    static_assert(std::is_same_v<typename std::decay_t<W>::value_type, T>);
     impl::dot_blas<T>(
         std::span(A.data_handle(), A.size()), {A.extent(0), A.extent(1)},
         std::span(B.data_handle(), B.size()), {B.extent(0), B.extent(1)},

From f183dacea14998c498a1eccfe551ff9b3fb6cc5b Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 10:05:54 +0000
Subject: [PATCH 07/15] Update assert

---
 cpp/basix/math.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index aaffc79d4..851214204 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -331,12 +331,14 @@ void dot(const U& A, const V& B, W&& C)
   }
   else
   {
+    // #if defined(_MSC_VER) && !defined(__clang__)
     static_assert(std::is_same_v<typename std::decay_t<U>::layout_type,
-                                 std::layout_right>);
+                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_left>);
     static_assert(std::is_same_v<typename std::decay_t<V>::layout_type,
-                                 std::layout_right>);
+                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_left>);
     static_assert(std::is_same_v<typename std::decay_t<W>::layout_type,
-                                 std::layout_right>);
+                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_left>);
+    // #endif
 
     // static_assert(std::is_same_v<typename U::layout_type,
     // std::layout_right>); static_assert(std::is_same_v<typename

From 3c7cd28cf83f61e57d2a96a9195c08d9c6d701e1 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 10:33:24 +0000
Subject: [PATCH 08/15] Assert fix

---
 cpp/basix/math.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 851214204..81be508b2 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -333,11 +333,11 @@ void dot(const U& A, const V& B, W&& C)
   {
     // #if defined(_MSC_VER) && !defined(__clang__)
     static_assert(std::is_same_v<typename std::decay_t<U>::layout_type,
-                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_left>);
+                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<V>::layout_type,
-                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_left>);
+                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<W>::layout_type,
-                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_left>);
+                                 MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     // #endif
 
     // static_assert(std::is_same_v<typename U::layout_type,

From a9776f825394d7264f90258b15c19de7ff38274a Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 12:20:01 +0000
Subject: [PATCH 09/15] Disable static check for MSVC

---
 cpp/basix/math.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 81be508b2..b859f9572 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -331,21 +331,14 @@ void dot(const U& A, const V& B, W&& C)
   }
   else
   {
-    // #if defined(_MSC_VER) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
     static_assert(std::is_same_v<typename std::decay_t<U>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<V>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<W>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
-    // #endif
-
-    // static_assert(std::is_same_v<typename U::layout_type,
-    // std::layout_right>); static_assert(std::is_same_v<typename
-    // V::layout_type, std::layout_right>); static_assert(
-    //     std::is_same_v<typename std::remove_cv<typename W::layout_type>,
-    //                    std::layout_right>);
-
+#endif
     using T = typename std::decay_t<U>::value_type;
     static_assert(std::is_same_v<typename std::decay_t<V>::value_type, T>);
     static_assert(std::is_same_v<typename std::decay_t<W>::value_type, T>);

From 55422e2d38c6626861286ce04be0b884d6bfe6fb Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 13:09:58 +0000
Subject: [PATCH 10/15] Fix

---
 cpp/basix/math.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index b859f9572..c0b1aa4e8 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -331,7 +331,7 @@ void dot(const U& A, const V& B, W&& C)
   }
   else
   {
-#if defined(_MSC_VER) && !defined(__clang__)
+#ifndef _MSVC_LANG
     static_assert(std::is_same_v<typename std::decay_t<U>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<V>::layout_type,

From ff71810ba8b114cc0b6904efabe8951d5b5c2012 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 17:46:27 +0000
Subject: [PATCH 11/15] Generalise mat-mat product

---
 cpp/basix/dof-transformations.cpp | 46 +++++++++++++++++++++----------
 cpp/basix/math.h                  | 39 ++++++++++++++++----------
 2 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/cpp/basix/dof-transformations.cpp b/cpp/basix/dof-transformations.cpp
index 03b4a77cf..706d18cfd 100644
--- a/cpp/basix/dof-transformations.cpp
+++ b/cpp/basix/dof-transformations.cpp
@@ -94,11 +94,11 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
   {
     mapinfo_t<T> mapinfo;
     auto& data = mapinfo.try_emplace(cell::type::interval).first->second;
-    auto map = [](auto pt) -> std::array<T, 3> { return {pt[1], pt[0], 0.0}; };
+    auto map = [](auto pt) -> std::array<T, 3> { return {pt[1], pt[0], 0}; };
     stdex::mdarray<T, stdex::extents<std::size_t, 2, 2>> J(
         stdex::extents<std::size_t, 2, 2>{}, {0., 1., 1., 0.});
 
-    T detJ = -1.0;
+    T detJ = -1;
     stdex::mdarray<T, stdex::extents<std::size_t, 2, 2>> K(
         stdex::extents<std::size_t, 2, 2>{}, {0., 1., 1., 0.});
     data.push_back(std::tuple(map, J, detJ, K));
@@ -485,24 +485,42 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
   // Interpolate to calculate coefficients
   std::vector<T> transformb(ndofs * ndofs);
   mdspan_t<T, 2> transform(transformb.data(), ndofs, ndofs);
-  for (std::size_t d = 0; d < imat.extent(3); ++d)
+
+  std::vector<T> imat_b(transform.extent(1) * imat.extent(2));
+  mdspan_t<T, 2> imat_id(imat_b.data(), transform.extent(1), imat.extent(2));
+
+  std::vector<T> pushed_datai_b(imat.extent(2) * transform.extent(0));
+  mdspan_t<T, 2> pushed_data_i(pushed_datai_b.data(), imat.extent(2),
+                               transform.extent(0));
+
+  std::vector<T> transformT_b(transform.extent(1) * transform.extent(0));
+  mdspan_t<T, 2> transformT(transformT_b.data(), transform.extent(1),
+                            transform.extent(0));
+  for (std::size_t i = 0; i < vs; ++i)
   {
-    for (std::size_t i = 0; i < vs; ++i)
+    // Pack pushed_data
+    for (std::size_t k2 = 0; k2 < imat.extent(2); ++k2)
+      for (std::size_t k1 = 0; k1 < transform.extent(0); ++k1)
+        pushed_data_i(k2, k1) = pushed_data(k2, k1 + dofstart, i);
+
+    for (std::size_t d = 0; d < imat.extent(3); ++d)
     {
+      // Pack imat
       for (std::size_t k0 = 0; k0 < transform.extent(1); ++k0)
-      {
-        for (std::size_t k1 = 0; k1 < transform.extent(0); ++k1)
-        {
-          for (std::size_t k2 = 0; k2 < imat.extent(2); ++k2)
-          {
-            transform(k1, k0)
-                += imat(k0, i, k2, d) * pushed_data(k2, k1 + dofstart, i);
-          }
-        }
-      }
+        for (std::size_t k2 = 0; k2 < imat.extent(2); ++k2)
+          imat_id(k0, k2) = imat(k0, i, k2, d);
+
+      // transformT_(k0, k1) = imat_id_(k0, k2) pushed_data_i(k2, k1) +
+      // transformT_(k0, k1)
+      math::dot(imat_id, pushed_data_i, transformT, 1, 1);
     }
   }
 
+  // Transpose 'transformT' -> 'transform'
+  for (std::size_t k0 = 0; k0 < transform.extent(1); ++k0)
+    for (std::size_t k1 = 0; k1 < transform.extent(0); ++k1)
+      transform(k1, k0) = transformT(k0, k1);
+
   return {std::move(transformb), {transform.extent(0), transform.extent(1)}};
 }
 } // namespace
diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index c0b1aa4e8..9fe053880 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -51,14 +51,15 @@ namespace basix::math
 {
 namespace impl
 {
-/// @brief Compute C = A * B using BLAS.
+/// @brief Compute C = alpha A * B  + \beta C using BLAS (GEMM).
 /// @param[in] A Input matrix
 /// @param[in] B Input matrix
-/// @return A * B
+/// @param[in] alpha
+/// @param[in] beta
 template <std::floating_point T>
 void dot_blas(std::span<const T> A, std::array<std::size_t, 2> Ashape,
               std::span<const T> B, std::array<std::size_t, 2> Bshape,
-              std::span<T> C)
+              std::span<T> C, T alpha = 1, T beta = 0)
 {
   static_assert(std::is_same_v<T, float> or std::is_same_v<T, double>);
 
@@ -69,8 +70,6 @@ void dot_blas(std::span<const T> A, std::array<std::size_t, 2> Ashape,
   int N = Bshape[1];
   int K = Ashape[1];
 
-  T alpha = 1;
-  T beta = 0;
   int lda = K;
   int ldb = N;
   int ldc = N;
@@ -310,42 +309,52 @@ transpose_lu(std::pair<std::vector<T>, std::array<std::size_t, 2>>& A)
   return perm;
 }
 
-/// @brief Compute C = A * B.
+/// @brief Compute C = \alpha A * B + \beta C
 /// @param[in] A Input matrix
 /// @param[in] B Input matrix
 /// @param[out] C Output matrix. Must be sized correctly before calling
 /// this function.
+/// @param[in] alpha
+/// @param[in] beta
 template <typename U, typename V, typename W>
-void dot(const U& A, const V& B, W&& C)
+void dot(const U& A, const V& B, W&& C,
+         typename std::decay_t<U>::value_type alpha = 1,
+         typename std::decay_t<U>::value_type beta = 0)
 {
+  using T = typename std::decay_t<U>::value_type;
+
   assert(A.extent(1) == B.extent(0));
   assert(C.extent(0) == A.extent(0));
   assert(C.extent(1) == B.extent(1));
   if (A.extent(0) * B.extent(1) * A.extent(1) < 512)
   {
-    std::fill_n(C.data_handle(), C.extent(0) * C.extent(1), 0);
     for (std::size_t i = 0; i < A.extent(0); ++i)
+    {
       for (std::size_t j = 0; j < B.extent(1); ++j)
+      {
+        T C0 = C(i, j);
+        C(i, j) = 0;
+        T& _C = C(i, j);
         for (std::size_t k = 0; k < A.extent(1); ++k)
-          C(i, j) += A(i, k) * B(k, j);
+          _C += alpha * A(i, k) * B(k, j);
+        _C = alpha * _C + beta * C0;
+      }
+    }
   }
   else
   {
-#ifndef _MSVC_LANG
     static_assert(std::is_same_v<typename std::decay_t<U>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<V>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
     static_assert(std::is_same_v<typename std::decay_t<W>::layout_type,
                                  MDSPAN_IMPL_STANDARD_NAMESPACE::layout_right>);
-#endif
-    using T = typename std::decay_t<U>::value_type;
     static_assert(std::is_same_v<typename std::decay_t<V>::value_type, T>);
     static_assert(std::is_same_v<typename std::decay_t<W>::value_type, T>);
     impl::dot_blas<T>(
         std::span(A.data_handle(), A.size()), {A.extent(0), A.extent(1)},
         std::span(B.data_handle(), B.size()), {B.extent(0), B.extent(1)},
-        std::span(C.data_handle(), C.size()));
+        std::span(C.data_handle(), C.size()), alpha, beta);
   }
 }
 
@@ -386,8 +395,8 @@ void orthogonalise(
     norm = std::sqrt(norm);
     if (norm < 2 * std::numeric_limits<T>::epsilon())
     {
-      throw std::runtime_error(
-          "Cannot orthogonalise the rows of a matrix with incomplete row rank");
+      throw std::runtime_error("Cannot orthogonalise the rows of a matrix "
+                               "with incomplete row rank");
     }
 
     for (std::size_t k = 0; k < wcoeffs.extent(1); ++k)

From 3953b42f40b6ba2943138ca85f3d4299b55e5048 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 18:38:31 +0000
Subject: [PATCH 12/15] Generalise GEMM

---
 cpp/basix/dof-transformations.cpp | 81 +++++++++++--------------------
 cpp/basix/dof-transformations.h   | 10 ++--
 cpp/basix/math.h                  | 55 ++++++++++-----------
 3 files changed, 63 insertions(+), 83 deletions(-)

diff --git a/cpp/basix/dof-transformations.cpp b/cpp/basix/dof-transformations.cpp
index 706d18cfd..42793a841 100644
--- a/cpp/basix/dof-transformations.cpp
+++ b/cpp/basix/dof-transformations.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 Chris Richardson & Matthew Scroggs
+// Copyright (c) 2020-2024 Chris Richardson, Matthew Scroggs and Garth N. Wells
 // FEniCS Project
 // SPDX-License-Identifier:    MIT
 
@@ -13,10 +13,10 @@
 #include <span>
 #include <tuple>
 
-#include <iostream>
-
 using namespace basix;
 
+namespace
+{
 namespace stdex
     = MDSPAN_IMPL_STANDARD_NAMESPACE::MDSPAN_IMPL_PROPOSED_NAMESPACE;
 template <typename T, std::size_t d>
@@ -34,17 +34,13 @@ using map_data_t
 template <typename T>
 using mapinfo_t = std::map<cell::type, std::vector<map_data_t<T>>>;
 
-namespace
-{
 //-----------------------------------------------------------------------------
 int find_first_subentity(cell::type cell_type, cell::type entity_type)
 {
   const int edim = cell::topological_dimension(entity_type);
   std::vector<cell::type> entities = cell::subentity_types(cell_type)[edim];
   if (auto it = std::ranges::find(entities, entity_type); it != entities.end())
-  {
     return std::distance(entities.begin(), it);
-  }
   else
     throw std::runtime_error("Entity not found");
 }
@@ -373,13 +369,12 @@ mapinfo_t<T> get_mapinfo(cell::type cell_type)
 //-----------------------------------------------------------------------------
 template <std::floating_point T>
 std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
-    cell::type cell_type,
-    const std::array<std::vector<mdspan_t<const T, 2>>, 4>& x,
-    const std::array<std::vector<mdspan_t<const T, 4>>, 4>& M,
+    cell::type cell_type, std::array<std::vector<mdspan_t<const T, 2>>, 4> x,
+    std::array<std::vector<mdspan_t<const T, 4>>, 4> M,
     mdspan_t<const T, 2> coeffs, const mdarray_t<T, 2>& J, T detJ,
     const mdarray_t<T, 2>& K,
-    const std::function<std::array<T, 3>(std::span<const T>)> map_point,
-    int degree, int tdim, int entity, std::size_t vs, const maps::type map_type,
+    std::function<std::array<T, 3>(std::span<const T>)> map_point, int degree,
+    int tdim, int entity, std::size_t vs, const maps::type map_type,
     const polyset::type ptype)
 {
   if (x[tdim].size() == 0 or x[tdim][entity].extent(0) == 0)
@@ -431,34 +426,17 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
                          polyset_vals.extent(0));
   for (std::size_t j = 0; j < vs; ++j)
   {
-    // std::fill(result_b.begin(), result_b.end(), 0);
-
     for (std::size_t k0 = 0; k0 < coeffs.extent(0); ++k0)
       for (std::size_t k2 = 0; k2 < polyset_vals.extent(0); ++k2)
         _coeffs(k0, k2) = coeffs(k0, k2 + psize * j);
 
-    // R^T = coeffs * polyset_vals
-    // for (std::size_t k0 = 0; k0 < coeffs.extent(0); ++k0) // big
-    //   for (std::size_t k1 = 0; k1 < polyset_vals.extent(1); ++k1)
-    //     for (std::size_t k2 = 0; k2 < polyset_vals.extent(0); ++k2) // big
-    //       result(k1, k0) += _coeffs(k0, k2) * polyset_vals(k2, k1);
-    // result(k1, k0) += coeffs(k0, k2 + psize * j) * polyset_vals(k2, k1);
-
     // r^t: coeffs.extent(0) x polyset_vals.extent(1) [k0, k1]
     // c: coeffs.extent(1) x polyset_vals.extent(0)   [k0, k2]
     // p: polyset_vals.extent(0) x polyset_vals.extent(1)  [k2, k1]
-
     math::dot(_coeffs, polyset_vals, result);
-
-    // std::cout << "4: transform" << std::endl;
-
     for (std::size_t k0 = 0; k0 < result.extent(1); ++k0)
       for (std::size_t k1 = 0; k1 < result.extent(0); ++k1)
         tabulated_data(k0, k1, j) = result(k1, k0);
-
-    // for (std::size_t k0 = 0; k0 < result.extent(0); ++k0)
-    //   for (std::size_t k1 = 0; k1 < result.extent(1); ++k1)
-    //     tabulated_data(k0, k1, j) = result(k0, k1);
   }
 
   // push forward
@@ -471,11 +449,9 @@ std::pair<std::vector<T>, std::array<std::size_t, 2>> compute_transformation(
           tabulated_data_b.data()
               + i * tabulated_data.extent(1) * tabulated_data.extent(2),
           tabulated_data.extent(1), tabulated_data.extent(2));
-
       push_forward(map_type,
                    mdspan_t<T, 2>(temp_data.data(), temp_data.extents()), tab,
                    J, detJ, K);
-
       for (std::size_t k0 = 0; k0 < temp_data.extent(0); ++k0)
         for (std::size_t k1 = 0; k1 < temp_data.extent(1); ++k1)
           pushed_data(i, k0, k1) = temp_data(k0, k1);
@@ -529,14 +505,16 @@ template <std::floating_point T>
 std::map<cell::type, std::pair<std::vector<T>, std::array<std::size_t, 3>>>
 doftransforms::compute_entity_transformations(
     cell::type cell_type,
-    const std::array<
+    std::array<
         std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
             const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>>,
-        4>& x,
-    const std::array<
+        4>
+        x,
+    std::array<
         std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
             const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>>,
-        4>& M,
+        4>
+        M,
     MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
         const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>
         coeffs,
@@ -550,7 +528,6 @@ doftransforms::compute_entity_transformations(
     const int tdim = cell::topological_dimension(entity_type);
     const int entity = find_first_subentity(cell_type, entity_type);
     std::size_t ndofs = M[tdim].size() == 0 ? 0 : M[tdim][entity].extent(0);
-
     std::vector<T> transform;
     transform.reserve(emap_data.size() * ndofs * ndofs);
     for (auto& [mapfn, J, detJ, K] : emap_data)
@@ -574,14 +551,14 @@ template std::map<cell::type,
                   std::pair<std::vector<float>, std::array<std::size_t, 3>>>
 doftransforms::compute_entity_transformations(
     cell::type,
-    const std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-                         const float, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<
-                                          std::size_t, 2>>>,
-                     4>&,
-    const std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-                         const float, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<
-                                          std::size_t, 4>>>,
-                     4>&,
+    std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
+                   const float,
+                   MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>>,
+               4>,
+    std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
+                   const float,
+                   MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>>,
+               4>,
     MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
         const float, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>,
     int, std::size_t, maps::type, polyset::type);
@@ -590,14 +567,14 @@ template std::map<cell::type,
                   std::pair<std::vector<double>, std::array<std::size_t, 3>>>
 doftransforms::compute_entity_transformations(
     cell::type,
-    const std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-                         const double, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<
-                                           std::size_t, 2>>>,
-                     4>&,
-    const std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-                         const double, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<
-                                           std::size_t, 4>>>,
-                     4>&,
+    std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
+                   const double,
+                   MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>>,
+               4>,
+    std::array<std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
+                   const double,
+                   MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>>,
+               4>,
     MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
         const double, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>,
     int, std::size_t, maps::type, polyset::type);
diff --git a/cpp/basix/dof-transformations.h b/cpp/basix/dof-transformations.h
index 3399c7397..596194606 100644
--- a/cpp/basix/dof-transformations.h
+++ b/cpp/basix/dof-transformations.h
@@ -40,14 +40,16 @@ template <std::floating_point T>
 std::map<cell::type, std::pair<std::vector<T>, std::array<std::size_t, 3>>>
 compute_entity_transformations(
     cell::type cell_type,
-    const std::array<
+    std::array<
         std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
             const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>>,
-        4>& x,
-    const std::array<
+        4>
+        x,
+    std::array<
         std::vector<MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
             const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>>,
-        4>& M,
+        4>
+        M,
     MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
         const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>
         coeffs,
diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 9fe053880..ac6fff692 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 Igor Baratta
+// Copyright (C) 2021-2024 Igor Baratta and Garth N. Wells
 //
 // This file is part of DOLFINx (https://www.fenicsproject.org)
 //
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include "mdspan.hpp"
 #include <algorithm>
 #include <array>
 #include <cmath>
@@ -16,8 +17,6 @@
 #include <utility>
 #include <vector>
 
-#include "mdspan.hpp"
-
 extern "C"
 {
   void ssyevd_(char* jobz, char* uplo, int* n, float* a, int* lda, float* w,
@@ -45,15 +44,15 @@ extern "C"
 
 /// @brief Mathematical functions.
 ///
-/// @note The functions in this namespace are designed to be called
-/// multiple times at runtime, so their performance is critical.
+/// @note Functions in this namespace are designed to be called multiple
+/// times at runtime, so their performance is critical.
 namespace basix::math
 {
 namespace impl
 {
 /// @brief Compute C = alpha A * B  + \beta C using BLAS (GEMM).
-/// @param[in] A Input matrix
-/// @param[in] B Input matrix
+/// @param[in] A Input matrix.
+/// @param[in] B Input matrix.
 /// @param[in] alpha
 /// @param[in] beta
 template <std::floating_point T>
@@ -89,8 +88,8 @@ void dot_blas(std::span<const T> A, std::array<std::size_t, 2> Ashape,
 } // namespace impl
 
 /// @brief Compute the outer product of vectors u and v.
-/// @param u The first vector
-/// @param v The second vector
+/// @param u The first vector.
+/// @param v The second vector.
 /// @return The outer product. The type will be the same as `u`.
 template <typename U, typename V>
 std::pair<std::vector<typename U::value_type>, std::array<std::size_t, 2>>
@@ -103,7 +102,7 @@ outer(const U& u, const V& v)
   return {std::move(result), {u.size(), v.size()}};
 }
 
-/// Compute the cross product u x v
+/// @brief Compute the cross product u x v.
 /// @param u The first vector. It must has size 3.
 /// @param v The second vector. It must has size 3.
 /// @return The cross product `u x v`. The type will be the same as `u`.
@@ -116,12 +115,13 @@ std::array<typename U::value_type, 3> cross(const U& u, const V& v)
           u[0] * v[1] - u[1] * v[0]};
 }
 
-/// Compute the eigenvalues and eigenvectors of a square Hermitian matrix A
-/// @param[in] A Input matrix, row-major storage
-/// @param[in] n Number of rows
+/// @brief Compute the eigenvalues and eigenvectors of a square
+/// Hermitian matrix A.
+/// @param[in] A Input matrix, row-major storage.
+/// @param[in] n Number of rows.
 /// @return Eigenvalues (0) and eigenvectors (1). The eigenvector array
 /// uses column-major storage, which each column being an eigenvector.
-/// @pre The matrix `A` must be symmetric
+/// @pre The matrix `A` must be symmetric.
 template <std::floating_point T>
 std::pair<std::vector<T>, std::vector<T>> eigh(std::span<const T> A,
                                                std::size_t n)
@@ -179,9 +179,9 @@ std::pair<std::vector<T>, std::vector<T>> eigh(std::span<const T> A,
 }
 
 /// @brief Solve A X = B.
-/// @param[in] A The matrix
-/// @param[in] B Right-hand side matrix/vector
-/// @return A^{-1} B
+/// @param[in] A The matrix.
+/// @param[in] B Right-hand side matrix/vector.@brief
+/// @return A^{-1} B.
 template <std::floating_point T>
 std::vector<T>
 solve(MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
@@ -209,6 +209,7 @@ solve(MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
   int nrhs = _B.extent(1);
   int lda = _A.extent(0);
   int ldb = _B.extent(0);
+
   // Pivot indices that define the permutation matrix for the LU solver
   std::vector<int> piv(N);
   int info;
@@ -231,9 +232,9 @@ solve(MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
   return rb;
 }
 
-/// @brief Check if A is a singular matrix,
-/// @param[in] A The matrix
-/// @return A bool indicating if the matrix is singular
+/// @brief Check if A is a singular matrix.
+/// @param[in] A The matrix.
+/// @return A bool indicating if the matrix is singular.
 template <std::floating_point T>
 bool is_singular(
     MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
@@ -277,9 +278,9 @@ bool is_singular(
 
 /// @brief Compute the LU decomposition of the transpose of a square
 /// matrix A.
-/// @param[in,out] A The matrix
+/// @param[in,out] A The matrix.
 /// @return The LU permutation, in prepared format (see
-/// precompute::prepare_permutation)
+/// precompute::prepare_permutation).
 template <std::floating_point T>
 std::vector<std::size_t>
 transpose_lu(std::pair<std::vector<T>, std::array<std::size_t, 2>>& A)
@@ -326,7 +327,7 @@ void dot(const U& A, const V& B, W&& C,
   assert(A.extent(1) == B.extent(0));
   assert(C.extent(0) == A.extent(0));
   assert(C.extent(1) == B.extent(1));
-  if (A.extent(0) * B.extent(1) * A.extent(1) < 512)
+  if (A.extent(0) * B.extent(1) * A.extent(1) < 256)
   {
     for (std::size_t i = 0; i < A.extent(0); ++i)
     {
@@ -336,7 +337,7 @@ void dot(const U& A, const V& B, W&& C,
         C(i, j) = 0;
         T& _C = C(i, j);
         for (std::size_t k = 0; k < A.extent(1); ++k)
-          _C += alpha * A(i, k) * B(k, j);
+          _C += A(i, k) * B(k, j);
         _C = alpha * _C + beta * C0;
       }
     }
@@ -359,8 +360,8 @@ void dot(const U& A, const V& B, W&& C,
 }
 
 /// @brief Build an identity matrix.
-/// @param[in] n The number of rows/columns
-/// @return Identity matrix using row-major storage
+/// @param[in] n The number of rows/columns.
+/// @return Identity matrix using row-major storage.
 template <std::floating_point T>
 std::vector<T> eye(std::size_t n)
 {
@@ -376,7 +377,7 @@ std::vector<T> eye(std::size_t n)
 }
 
 /// @brief Orthogonalise the rows of a matrix (in place).
-/// @param[in] wcoeffs The matrix
+/// @param[in] wcoeffs The matrix.
 /// @param[in] start The row to start from. The rows before this should
 /// already be orthogonal.
 template <std::floating_point T>

From f9867c9e85bcc42b7fd14d8ee31451b1e3620aa2 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 18:52:38 +0000
Subject: [PATCH 13/15] Extend tests

---
 test/test_lagrange.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_lagrange.py b/test/test_lagrange.py
index ca0f4bc35..4c7a2caf8 100644
--- a/test/test_lagrange.py
+++ b/test/test_lagrange.py
@@ -856,7 +856,7 @@ def test_tet(degree):
         (basix.CellType.tetrahedron, basix.CellType.tetrahedron),
     ],
 )
-@pytest.mark.parametrize("degree", [1, 2, 3, 4])
+@pytest.mark.parametrize("degree", [1, 2, 3, 4, 12])
 def test_lagrange(celltype, degree):
     lagrange = basix.create_element(
         basix.ElementFamily.P, celltype[1], degree, basix.LagrangeVariant.equispaced
@@ -866,7 +866,7 @@ def test_lagrange(celltype, degree):
     assert np.isclose(np.sum(w, axis=1), 1.0).all()
 
 
-@pytest.mark.parametrize("degree", [1, 2, 3, 4])
+@pytest.mark.parametrize("degree", [1, 2, 3, 4, 12])
 def test_dof_transformations_interval(degree):
     lagrange = basix.create_element(
         basix.ElementFamily.P, basix.CellType.interval, degree, basix.LagrangeVariant.equispaced

From ab5c8fe4cc49a9e3620053cca1f8c6178d9db0f6 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 16 Nov 2024 20:58:33 +0000
Subject: [PATCH 14/15] Doc syntax fix

---
 cpp/basix/math.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index ac6fff692..8f13a2398 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -50,7 +50,7 @@ namespace basix::math
 {
 namespace impl
 {
-/// @brief Compute C = alpha A * B  + \beta C using BLAS (GEMM).
+/// @brief Compute C = alpha A * B  + beta C using BLAS (GEMM).
 /// @param[in] A Input matrix.
 /// @param[in] B Input matrix.
 /// @param[in] alpha
@@ -310,7 +310,7 @@ transpose_lu(std::pair<std::vector<T>, std::array<std::size_t, 2>>& A)
   return perm;
 }
 
-/// @brief Compute C = \alpha A * B + \beta C
+/// @brief Compute C = alpha A * B + beta C
 /// @param[in] A Input matrix
 /// @param[in] B Input matrix
 /// @param[out] C Output matrix. Must be sized correctly before calling

From 5a54b52224fb1d3e7452d7461671fc7e2159e0bb Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Mon, 18 Nov 2024 08:26:12 +0000
Subject: [PATCH 15/15] Update cpp/basix/math.h

Co-authored-by: Matthew Scroggs <matthew.w.scroggs@gmail.com>
---
 cpp/basix/math.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/basix/math.h b/cpp/basix/math.h
index 8f13a2398..8f19a42b2 100644
--- a/cpp/basix/math.h
+++ b/cpp/basix/math.h
@@ -180,7 +180,7 @@ std::pair<std::vector<T>, std::vector<T>> eigh(std::span<const T> A,
 
 /// @brief Solve A X = B.
 /// @param[in] A The matrix.
-/// @param[in] B Right-hand side matrix/vector.@brief
+/// @param[in] B Right-hand side matrix/vector.
 /// @return A^{-1} B.
 template <std::floating_point T>
 std::vector<T>